#include <stdlib.h>
#include "expat.h"
#include <getopt.h>
+#include "disorder.h"
+#include "md5.h"
-#define BUFFER_SIZE 80
// timestamp of the form 2003-11-07T00:43:23Z
#define DATE_LENGTH 10
#define TIME_LENGTH 8
#define TIMESTAMP_LENGTH 20
+#define MEGABYTE 1048576
+#define FIELD_BUFFER_SIZE 1024
+// 2048 KB in bytes + 1
+//#define TEXT_BUFFER_SIZE 2097153
+//#define TEXT_BUFFER_SIZE 10485760
+
enum elements {
TITLE, ARTICLEID, REVISION, REVID, TIMESTAMP, CONTRIBUTOR,
EDITOR, EDITORID, MINOR, COMMENT, UNUSED, TEXT
typedef struct {
- struct {
- char *title;
- char *articleid;
- char *revid;
- char *date;
- char *time;
- char *timestamp;
- char *anon;
- char *editor;
- char *editorid;
- bool minor;
- char *comment;
- char *text;
- } rev;
+ // buffers allocated once in init_data and reused for every revision; clean_data empties them in place
+ char *title;
+ char *articleid;
+ char *revid;
+ char *date;
+ char *time;
+ char *timestamp;
+ char *anon; // NOTE(review): only reset in the visible code; write_row derives the anon flag from editor instead
+ char *editor;
+ char *editorid;
+ char *comment;
+ char *text; // init_data gives this buffer 4 * MEGABYTE; every other buffer gets FIELD_BUFFER_SIZE
+
+ // current string length of each buffer (not its capacity), tracked so that charhndl can append
+ // at a known offset instead of calling strlen for every chunk, which would be O(N^2) overall
+ size_t title_size;
+ size_t articleid_size;
+ size_t revid_size;
+ size_t date_size;
+ size_t time_size;
+ size_t timestamp_size;
+ size_t anon_size;
+ size_t editor_size;
+ size_t editorid_size;
+ size_t comment_size;
+ size_t text_size;
+
+ bool minor; // set to true when an element named minor is encountered
enum elements element;
enum block position;
static void
clean_data(revisionData *data, int title) // empties the reusable field buffers in place; nothing is freed or reallocated
{
+ // article-level fields (title, articleid) are cleared only when title is non-zero, i.e. when switching articles
if (title) {
- data->rev.title = NULL;
- data->rev.articleid = NULL;
+ data->title[0] = '\0';
+ data->articleid[0] = '\0';
+ data->title_size = 0;
+ data->articleid_size = 0;
}
- data->rev.revid = NULL;
- data->rev.date = NULL;
- data->rev.time = NULL;
- data->rev.timestamp = NULL;
- data->rev.anon = NULL;
- data->rev.editor = NULL;
- data->rev.editorid = NULL;
- data->rev.minor = false;
- data->rev.comment = NULL;
- data->rev.text = NULL;
+
+ // truncate the per-revision strings by writing a terminator at index 0; the buffers stay allocated
+ data->revid[0] = '\0';
+ data->date[0] = '\0'; // NOTE(review): only byte 0 is cleared; split_timestamp copies with strncpy and adds no terminator, so stale bytes may follow
+ data->time[0] = '\0';
+ data->timestamp[0] = '\0';
+ data->anon[0] = '\0';
+ data->editor[0] = '\0';
+ data->editorid[0] = '\0';
+ data->comment[0] = '\0';
+ data->text[0] = '\0';
+
+ // reset the tracked lengths so they agree with the now-empty strings
+ data->revid_size = 0;
+ data->date_size = 0;
+ data->time_size = 0;
+ data->timestamp_size = 0;
+ data->anon_size = 0;
+ data->editor_size = 0;
+ data->editorid_size = 0;
+ data->comment_size = 0;
+ data->text_size = 0;
+
+ // reset flags and element type info (position is left untouched)
+ data->minor = false;
data->element = UNUSED;
- //data->position =
+
}
+// presently unused (cleanup_revision and cleanup_article no longer call it): frees the field buffers; a non-zero title also frees title and articleid
static void
free_data(revisionData *data, int title) // NOTE(review): pointers are not reset after free, so a later clean_data would write through freed memory
{
if (title) {
//printf("freeing article\n");
- free(data->rev.title);
- free(data->rev.articleid);
+ free(data->title);
+ free(data->articleid);
}
- free(data->rev.revid);
- free(data->rev.date);
- free(data->rev.time);
- free(data->rev.timestamp);
- free(data->rev.anon);
- free(data->rev.editor);
- free(data->rev.editorid);
- free(data->rev.comment);
- free(data->rev.text);
+ free(data->revid);
+ free(data->date);
+ free(data->time);
+ free(data->timestamp);
+ free(data->anon);
+ free(data->editor);
+ free(data->editorid);
+ free(data->comment);
+ free(data->text);
}
void cleanup_revision(revisionData *data) { // resets the per-revision fields for reuse; buffers are kept, not freed
- free_data(data, 0);
clean_data(data, 0); // 0: leave title and articleid intact
}
void cleanup_article(revisionData *data) { // resets every field, including title and articleid; buffers are kept, not freed
- free_data(data, 1);
clean_data(data, 1); // 1: also clear the article-level fields
}
static void
init_data(revisionData *data, outtype output_type) // allocates every field buffer once, empties them, and records the output type
{
- clean_data(data, 1); // sets every element to null...
+ data->text = (char*) malloc(4 * MEGABYTE); // 2MB is the article length limit, 4MB is 'safe'? NOTE(review): no bound check on appends is visible
+ data->comment = (char*) malloc(FIELD_BUFFER_SIZE); // NOTE(review): none of these malloc results is checked before clean_data dereferences them
+ data->title = (char*) malloc(FIELD_BUFFER_SIZE);
+ data->articleid = (char*) malloc(FIELD_BUFFER_SIZE);
+ data->revid = (char*) malloc(FIELD_BUFFER_SIZE);
+ data->date = (char*) malloc(FIELD_BUFFER_SIZE);
+ data->time = (char*) malloc(FIELD_BUFFER_SIZE);
+ data->timestamp = (char*) malloc(FIELD_BUFFER_SIZE);
+ data->anon = (char*) malloc(FIELD_BUFFER_SIZE);
+ data->editor = (char*) malloc(FIELD_BUFFER_SIZE);
+ data->editorid = (char*) malloc(FIELD_BUFFER_SIZE);
+ data->minor = false;
+
+ // must run after the allocations: writes a terminator at index 0 of every buffer and zeroes the tracked lengths
+ clean_data(data, 1);
+
data->output_type = output_type;
}
{
printf("element = %i\n", data->element);
printf("output_type = %i\n", data->output_type);
- printf("title = %s\n", data->rev.title);
- printf("articleid = %s\n", data->rev.articleid);
- printf("revid = %s\n", data->rev.revid);
- printf("date = %s\n", data->rev.date);
- printf("time = %s\n", data->rev.time);
- printf("anon = %s\n", data->rev.anon);
- printf("editor = %s\n", data->rev.editor);
- printf("editorid = %s\n", data->rev.editorid);
- printf("minor = %s\n", (data->rev.minor ? "1" : "0"));
- printf("comment = %s\n", data->rev.comment);
- printf("text = %s\n", data->rev.text);
+ printf("title = %s\n", data->title);
+ printf("articleid = %s\n", data->articleid);
+ printf("revid = %s\n", data->revid);
+ printf("date = %s\n", data->date);
+ printf("time = %s\n", data->time);
+ printf("anon = %s\n", data->anon);
+ printf("editor = %s\n", data->editor);
+ printf("editorid = %s\n", data->editorid);
+ printf("minor = %s\n", (data->minor ? "1" : "0"));
+ printf("comment = %s\n", data->comment);
+ printf("text = %s\n", data->text);
printf("\n");
}
write_row(revisionData *data) // prints the current revision to standard output as one tab-separated row
{
- // TODO: make it so you can specify fields to output
- // note that date and time are separated by a space, to match postgres's
- // timestamp format
- printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s",
- (data->rev.title != NULL) ? data->rev.title : "",
- (data->rev.articleid != NULL) ? data->rev.articleid : "",
- (data->rev.revid != NULL) ? data->rev.revid : "",
- (data->rev.date != NULL) ? data->rev.date : "",
- (data->rev.time != NULL) ? data->rev.time : "",
- (data->rev.editor != NULL) ? "0" : "1",
- (data->rev.editor != NULL) ? data->rev.editor : "",
- (data->rev.editorid != NULL) ? data->rev.editorid : "",
- (data->rev.minor) ? "1" : "0");
- switch (data->output_type)
- {
- case SIMPLE:
- printf("\n");
- break;
- case FULL:
- printf("\t%s\t%s\n",
- (data->rev.comment != NULL) ? data->rev.comment : "",
- (data->rev.text != NULL) ? data->rev.text : "");
- break;
+ // MD5 over the first text_size bytes of the revision text, rendered as 32 lowercase hex digits plus a terminator
+ md5_state_t state;
+ md5_byte_t digest[16];
+ char md5_hex_output[2 * 16 + 1];
+ md5_init(&state);
+ md5_append(&state, (const md5_byte_t *)data->text, data->text_size);
+ md5_finish(&state, digest);
+ int di;
+ for (di = 0; di < 16; ++di) {
+ sprintf(md5_hex_output + di * 2, "%02x", digest[di]); // two hex digits per digest byte; the last call also writes the terminator
}
-}
-
-static char
-*timestr(char *timestamp, char time_buffer[TIME_LENGTH+1])
-{
- char *timeinstamp = ×tamp[DATE_LENGTH+1];
- strncpy(time_buffer, timeinstamp, TIME_LENGTH);
- time_buffer[TIME_LENGTH] = '\0'; // makes it a well-formed string
-}
-
-
-static char
-*datestr(char *timestamp, char date_buffer[DATE_LENGTH+1])
-{
- strncpy(date_buffer, timestamp, DATE_LENGTH);
- date_buffer[DATE_LENGTH] = '\0';
-}
-
-char
-*append(char *entry, char *newstr)
-{
- char *newbuff;
- int len;
- len = (strlen(entry)+strlen(newstr))*sizeof(char) + 1;
- newbuff = (char*) realloc(entry, len);
- strcat(newbuff, newstr);
- return newbuff;
-}
-
-char
-*cache(char *entry, char *newstr)
-{
- char *newbuff;
- int len;
- len = strlen(newstr)*sizeof(char) + 1; // include space for the '\0' !
- newbuff = (char*) malloc(len);
- strcpy(newbuff,newstr);
- return newbuff;
-
-}
+ // print one line of tsv output; date and time share a column, separated by a single space
+ printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s\t%i\t%f\t%s\n",
+ data->title,
+ data->articleid,
+ data->revid,
+ data->date,
+ data->time,
+ (data->editor[0] != '\0') ? "0" : "1", // anon flag: 1 when no editor name was captured
+ data->editor,
+ data->editorid,
+ (data->minor) ? "1" : "0",
+ (unsigned int) data->text_size, // NOTE(review): paired with %i (signed int) in the format; %u would match the cast
+ shannon_H(data->text, data->text_size), // presumably the Shannon entropy routine from disorder.h; printed with %f - confirm return type
+ md5_hex_output
+ );
+
+ // FULL output additionally prints the comment and the raw text on the lines after the row
+ if (data->output_type == FULL) {
+ printf("comment:%s\ntext:\n%s\n", data->comment, data->text);
+ }
-char
-*store(char *entry, char *newstr)
-{
- char *newbuff;
- if (entry == NULL)
- newbuff = cache(entry, newstr);
- else
- newbuff = append(entry, newstr);
- return newbuff;
}
void
split_timestamp(revisionData *data) // copies the date and time parts of data->timestamp into data->date and data->time
{
- char *t = data->rev.timestamp;
- char date_buffer[DATE_LENGTH+1];
- char time_buffer[TIME_LENGTH+1];
- datestr(t, date_buffer);
- timestr(t, time_buffer);
- data->rev.date = store(data->rev.date, date_buffer);
- data->rev.time = store(data->rev.time, time_buffer);
+ char *t = data->timestamp; // NOTE(review): t is not used by the added code
+ strncpy(data->date, data->timestamp, DATE_LENGTH); // NOTE(review): strncpy writes no terminator when the source has DATE_LENGTH or more characters; date_size is not updated
+ char *timeinstamp = &data->timestamp[DATE_LENGTH+1]; // skip the date and the one separator character that follows it
+ strncpy(data->time, timeinstamp, TIME_LENGTH); // NOTE(review): same termination concern as for date; time_size is not updated
}
-/* currently unused */
-static int
-is_whitespace(char *string) {
- int len = strlen(string);
- while (isspace(string[0]) && strlen(string) > 0) {
- string++;
- }
- if (strcmp(string, "") == 0)
- return 1;
- else
- return 0;
-}
+// like strncat but with previously known length: appends at most n characters of src at dest + dest_len, terminates the result, and returns dest
+char*
+strlcatn(char *dest, const char *src, size_t dest_len, size_t n)
+{
+ // dest_len comes from the caller instead of: size_t dest_len = strlen(dest);
+ size_t i;
-static void
-squeeze(char *s, int c) {
- int i, j;
- for (i = j = 0; s[i] != '\0'; i++)
- if (s[i] != c)
- s[j++] = s[i];
- s[j] = '\0';
-}
+ for (i = 0 ; i < n && src[i] != '\0' ; i++) // stops early if src contains a terminator before n characters
+ dest[dest_len + i] = src[i];
+ dest[dest_len + i] = '\0'; // NOTE(review): there is no capacity parameter, so nothing here bounds dest_len + n against the size of dest
-int
-contains(char *s, char *t)
-{
- char c = t[0]; //just get the first character of t
- int i = 0;
- while (s[i] != '\0') {
- if (s[i] == c)
- return 1;
- i++;
- }
+ return dest;
}
static void
{
revisionData* data = (revisionData*) vdata;
if (data->element != UNUSED && data->position != SKIP) { // character data is ignored for UNUSED elements and while position is SKIP
- char t[len];
- strncpy(t,s,len);
- t[len] = '\0'; // makes t a well-formed string
+ //char t[len];
+ //strncpy(t,s,len);
+ //t[len] = '\0'; // makes t a well-formed string
switch (data->element) {
+ case TEXT:
+ // printf("buffer length = %i, text: %s\n", len, t);
+ strlcatn(data->text, s, data->text_size, len); // append this chunk of s at the tracked end of the text buffer
+ data->text_size += len; // NOTE(review): advances by len unconditionally; no check against the 4 * MEGABYTE allocation is visible
+ break;
+ case COMMENT:
+ strlcatn(data->comment, s, data->comment_size, len);
+ data->comment_size += len; // NOTE(review): same unbounded growth, here against a FIELD_BUFFER_SIZE buffer (also applies to the cases below)
+ break;
case TITLE:
- {
- data->rev.title = store(data->rev.title, t);
- // skip any articles with bad characters in their titles
+ strlcatn(data->title, s, data->title_size, len);
+ data->title_size += len;
break;
- }
case ARTICLEID:
// printf("articleid = %s\n", t);
- data->rev.articleid = store(data->rev.articleid, t);
+ strlcatn(data->articleid, s, data->articleid_size, len);
+ data->articleid_size += len;
break;
case REVID:
// printf("revid = %s\n", t);
- data->rev.revid = store(data->rev.revid, t);
+ strlcatn(data->revid, s, data->revid_size, len);
+ data->revid_size += len;
break;
case TIMESTAMP:
- data->rev.timestamp = store(data->rev.timestamp, t);
- if (strlen(data->rev.timestamp) == TIMESTAMP_LENGTH)
+ strlcatn(data->timestamp, s, data->timestamp_size, len);
+ data->timestamp_size += len;
+ if (strlen(data->timestamp) == TIMESTAMP_LENGTH) // split only once the full timestamp has arrived; NOTE(review): timestamp_size should already hold this length
split_timestamp(data);
break;
- case EDITOR: {
- data->rev.editor = store(data->rev.editor, t);
+ case EDITOR:
+ strlcatn(data->editor, s, data->editor_size, len);
+ data->editor_size += len;
break;
- }
case EDITORID:
//printf("editorid = %s\n", t);
- data->rev.editorid = store(data->rev.editorid, t);
+ strlcatn(data->editorid, s, data->editorid_size, len);
+ data->editorid_size += len;
break;
/* the following are implied or skipped:
case MINOR:
break; minor tag is just a tag
case UNUSED:
*/
- case COMMENT:
- // printf("row: comment is %s\n", t);
- data->rev.comment = store(data->rev.comment, t);
- break;
- case TEXT:
- data->rev.text = store(data->rev.text, t);
- break;
default: break;
}
}
// minor tag has no character data, so we parse here
else if (strcmp(name,"minor") == 0) {
data->element = MINOR;
- data->rev.minor = true;
+ data->minor = true;
}
else if (strcmp(name,"timestamp") == 0)
data->element = TIMESTAMP;
fprintf(stderr, "Takes a wikimedia data dump XML stream on standard in, and produces\n");
fprintf(stderr, "a tab-separated stream of revisions on standard out:\n");
fprintf(stderr, "\n");
- fprintf(stderr, "title, articleid, revid, date, time, anon, editor, editorid, minor\n");
+ fprintf(stderr, "title, articleid, revid, timestamp, anon, editor, editorid, minor, revlength, reventropy, revmd5\n");
fprintf(stderr, "\n");
fprintf(stderr, "author: Erik Garrison <erik@hypervolu.me>\n");
}
output_type = SIMPLE;
char c;
- while ((c = getopt(argc, argv, "hsd")) != -1)
+ while ((c = getopt(argc, argv, "ht")) != -1)
switch (c)
{
case 'd':
}
// create a new instance of the expat parser
- XML_Parser parser = XML_ParserCreate(NULL);
+ XML_Parser parser = XML_ParserCreate("UTF-8");
// initialize the user data struct which is passed to callback functions
revisionData data;
// sets start and end to be the element start and end handlers
XML_SetElementHandler(parser, startFnPtr, endFnPtr);
- // sets charhndl to be the callback for raw character data
+ // sets charhndl to be the callback for character data
XML_SetCharacterDataHandler(parser, charHandlerFnPtr);
- int done;
+ bool done;
char buf[BUFSIZ];
- write_header();
-
// shovel data into the parser
do {
// read into buf a bufferfull of data from standard input
- size_t len = fread(buf, 1, sizeof(buf), stdin);
- done = len < sizeof(buf); // checks if we've got the last bufferfull
+ size_t len = fread(buf, 1, BUFSIZ, stdin);
+ done = len < BUFSIZ; // checks if we've got the last bufferfull
// passes the buffer of data to the parser and checks for error
// (this is where the callbacks are invoked)