#include <stdlib.h>
#include "expat.h"
#include <getopt.h>
+#include "disorder.h"
-#define BUFFER_SIZE 80
// timestamp of the form 2003-11-07T00:43:23Z
#define DATE_LENGTH 10
#define TIME_LENGTH 8
#define TIMESTAMP_LENGTH 20
+#define MEGABYTE 1048576
+#define FIELD_BUFFER_SIZE 1024
+// 2048 KB in bytes + 1
+//#define TEXT_BUFFER_SIZE 2097153
+//#define TEXT_BUFFER_SIZE 10485760
+
enum elements {
TITLE, ARTICLEID, REVISION, REVID, TIMESTAMP, CONTRIBUTOR,
EDITOR, EDITORID, MINOR, COMMENT, UNUSED, TEXT
enum block { TITLE_BLOCK, REVISION_BLOCK, CONTRIBUTOR_BLOCK, SKIP };
-enum outtype { NORMAL, SIMPLE };
+enum outtype { FULL, SIMPLE };
typedef struct {
- struct {
- char *title;
- char *articleid;
- char *revid;
- char *date;
- char *time;
- char *timestamp;
- char *anon;
- char *editor;
- char *editorid;
- char *minor;
- char *comment;
- char *text;
- } rev;
+ // pointers to once-allocated buffers
+ char *title;
+ char *articleid;
+ char *revid;
+ char *date;
+ char *time;
+ char *timestamp;
+ char *anon;
+ char *editor;
+ char *editorid;
+ char *comment;
+ char *text;
+
+ // track string size of the elements, to prevent O(N^2) processing in charhndl
+ // when we have to take strlen for every character which we append to the buffer
+ size_t title_size;
+ size_t articleid_size;
+ size_t revid_size;
+ size_t date_size;
+ size_t time_size;
+ size_t timestamp_size;
+ size_t anon_size;
+ size_t editor_size;
+ size_t editorid_size;
+ size_t comment_size;
+ size_t text_size;
+
+ bool minor;
- char *dropstr;
enum elements element;
enum block position;
enum outtype output_type;
-} parseData;
+} revisionData;
/* free_data and clean_data
* Also, frees memory dynamically allocated to store data.
*/
static void
-clean_data(parseData *data, int title)
+clean_data(revisionData *data, int title)
{
+ // reset title (if we are switching articles)
if (title) {
- data->rev.title = NULL;
- data->rev.articleid = NULL;
+ data->title[0] = '\0';
+ data->articleid[0] = '\0';
+ data->title_size = 0;
+ data->articleid_size = 0;
}
- data->rev.revid = NULL;
- data->rev.date = NULL;
- data->rev.time = NULL;
- data->rev.timestamp = NULL;
- data->rev.anon = NULL;
- data->rev.editor = NULL;
- data->rev.editorid = NULL;
- data->rev.minor = NULL;
- data->rev.comment = NULL;
- data->rev.text = NULL;
+
+ // reset text fields
+ data->revid[0] = '\0';
+ data->date[0] = '\0';
+ data->time[0] = '\0';
+ data->timestamp[0] = '\0';
+ data->anon[0] = '\0';
+ data->editor[0] = '\0';
+ data->editorid[0] = '\0';
+ data->comment[0] = '\0';
+ data->text[0] = '\0';
+
+ // reset length tracking
+ data->revid_size = 0;
+ data->date_size = 0;
+ data->time_size = 0;
+ data->timestamp_size = 0;
+ data->anon_size = 0;
+ data->editor_size = 0;
+ data->editorid_size = 0;
+ data->comment_size = 0;
+ data->text_size = 0;
+
+ // reset flags and element type info
+ data->minor = false;
data->element = UNUSED;
- //data->position =
+
}
+// presently unused
static void
-free_data(parseData *data, int title)
+free_data(revisionData *data, int title)
{
if (title) {
//printf("freeing article\n");
- free(data->rev.title);
- free(data->rev.articleid);
+ free(data->title);
+ free(data->articleid);
}
- free(data->rev.revid);
- free(data->rev.date);
- free(data->rev.time);
- free(data->rev.timestamp);
- free(data->rev.anon);
- free(data->rev.editor);
- free(data->rev.editorid);
- free(data->rev.minor);
- free(data->rev.comment);
- free(data->rev.text);
+ free(data->revid);
+ free(data->date);
+ free(data->time);
+ free(data->timestamp);
+ free(data->anon);
+ free(data->editor);
+ free(data->editorid);
+ free(data->comment);
+ free(data->text);
}
-cleanup_revision(parseData *data) {
- free_data(data, 0);
+void cleanup_revision(revisionData *data) {
clean_data(data, 0);
}
-cleanup_article(parseData *data) {
- free_data(data, 1);
+void cleanup_article(revisionData *data) {
clean_data(data, 1);
}
static void
-init_data(parseData *data, char *dropstr, int output_type)
+init_data(revisionData *data, outtype output_type)
{
- clean_data(data, 1); // sets every element to null...
- data->dropstr = dropstr;
+ data->text = (char*) malloc(4 * MEGABYTE); // 2MB is the article length limit, 4MB is 'safe'?
+ data->comment = (char*) malloc(FIELD_BUFFER_SIZE);
+ data->title = (char*) malloc(FIELD_BUFFER_SIZE);
+ data->articleid = (char*) malloc(FIELD_BUFFER_SIZE);
+ data->revid = (char*) malloc(FIELD_BUFFER_SIZE);
+ data->date = (char*) malloc(FIELD_BUFFER_SIZE);
+ data->time = (char*) malloc(FIELD_BUFFER_SIZE);
+ data->timestamp = (char*) malloc(FIELD_BUFFER_SIZE);
+ data->anon = (char*) malloc(FIELD_BUFFER_SIZE);
+ data->editor = (char*) malloc(FIELD_BUFFER_SIZE);
+ data->editorid = (char*) malloc(FIELD_BUFFER_SIZE);
+ data->minor = false;
+
+ // resets the data fields, null terminates strings, sets lengths
+ clean_data(data, 1);
+
data->output_type = output_type;
}
/* for debugging only, prints out the state of the data struct
*/
static void
-print_state(parseData *data)
+print_state(revisionData *data)
{
printf("element = %i\n", data->element);
printf("output_type = %i\n", data->output_type);
- printf("title = %s\n", data->rev.title);
- printf("articleid = %s\n", data->rev.articleid);
- printf("revid = %s\n", data->rev.revid);
- printf("date = %s\n", data->rev.date);
- printf("time = %s\n", data->rev.time);
- printf("anon = %s\n", data->rev.anon);
- printf("editor = %s\n", data->rev.editor);
- printf("editorid = %s\n", data->rev.editorid);
- printf("minor = %s\n", data->rev.minor);
- printf("comment = %s\n", data->rev.comment);
- printf("text = %s\n", data->rev.text);
+ printf("title = %s\n", data->title);
+ printf("articleid = %s\n", data->articleid);
+ printf("revid = %s\n", data->revid);
+ printf("date = %s\n", data->date);
+ printf("time = %s\n", data->time);
+ printf("anon = %s\n", data->anon);
+ printf("editor = %s\n", data->editor);
+ printf("editorid = %s\n", data->editorid);
+ printf("minor = %s\n", (data->minor ? "1" : "0"));
+ printf("comment = %s\n", data->comment);
+ printf("text = %s\n", data->text);
printf("\n");
}
* it is called right before cleanup_revision() and cleanup_article()
*/
static void
-write_row(parseData *data)
+write_row(revisionData *data)
{
- // define temporary variables to hold output values:
- char *title, *articleid;
- char *revid, *date, *time, *anon, *editor, *editorid;
- char *minor, *comment;
- char *text;
- // perform some simple logic to obtain correct output values
-
- if (data->rev.minor == NULL)
- minor = "0";
- else minor = data->rev.minor;
-
- if (data->rev.editor == NULL)
- anon = "1";
- else anon = "0";
-
- if (data->rev.title == NULL)
- title = "";
- else title = data->rev.title;
-
- if (data->rev.articleid == NULL)
- articleid = "";
- else articleid = data->rev.articleid;
-
- if (data->rev.revid == NULL)
- revid = "";
- else revid = data->rev.revid;
-
- if (data->rev.date == NULL)
- date = "";
- else date = data->rev.date;
-
- if (data->rev.time == NULL)
- time = "";
- else time = data->rev.time;
-
- if (data->rev.editor == NULL)
- editor = "";
- else editor = data->rev.editor;
-
- if (data->rev.editorid == NULL)
- editorid = "";
- else editorid = data->rev.editorid;
-
- if (data->rev.text == NULL)
- text = "";
- else text = data->rev.text;
-
-
- if (data->rev.comment == NULL)
- comment = "";
- else comment = data->rev.comment;
-
// TODO: make it so you can specify fields to output
// note that date and time are separated by a space, to match postgres's
// timestamp format
+ printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s",
+ data->title,
+ data->articleid,
+ data->revid,
+ data->date,
+ data->time,
+ (data->editor[0] != '\0') ? "0" : "1", // anon?
+ data->editor,
+ data->editorid,
+ (data->minor) ? "1" : "0");
switch (data->output_type)
{
- case NORMAL:
- printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s\t%s\t%s\n",
- title,articleid,revid,date,time,anon,editor,editorid,minor,comment,text);
- break;
case SIMPLE:
- printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s\n",
- title,articleid,revid,date,time,anon,editor,editorid,minor);
+ printf("\t%i\t%f\n", (unsigned int) strlen(data->text), shannon_H(data->text, data->text_size));
+ //printf("\n");
+ break;
+ case FULL:
+ printf("\t%s\t%s\n", data->comment, data->text);
break;
}
}
-static char
-*timestr(char *timestamp, char time_buffer[TIME_LENGTH+1])
-{
- char *timeinstamp = ×tamp[DATE_LENGTH+1];
- strncpy(time_buffer, timeinstamp, TIME_LENGTH);
- time_buffer[TIME_LENGTH] = '\0'; // makes it a well-formed string
-}
-
-
-static char
-*datestr(char *timestamp, char date_buffer[DATE_LENGTH+1])
-{
- strncpy(date_buffer, timestamp, DATE_LENGTH);
- date_buffer[DATE_LENGTH] = '\0';
-}
-
-char
-*append(char *entry, char *new)
-{
- char *newbuff;
- int len;
- len = (strlen(entry)+strlen(new))*sizeof(char) + 1;
- newbuff = realloc(entry, len);
- strcat(newbuff, new);
- return newbuff;
-}
-
-char
-*cache(char *entry, char *new)
-{
- char *newbuff;
- int len;
- len = strlen(new)*sizeof(char) + 1; // include space for the '\0' !
- newbuff = malloc(len);
- strcpy(newbuff,new);
- return newbuff;
-
-}
-
-char
-*store(char *entry, char *new)
-{
- char *newbuff;
- if (entry == NULL)
- newbuff = cache(entry, new);
- else
- newbuff = append(entry, new);
- return newbuff;
-}
-
void
-split_timestamp(parseData *data)
+split_timestamp(revisionData *data)
{
- char *t = data->rev.timestamp;
- char date_buffer[DATE_LENGTH+1];
- char time_buffer[TIME_LENGTH+1];
- datestr(t, date_buffer);
- timestr(t, time_buffer);
- data->rev.date = store(data->rev.date, date_buffer);
- data->rev.time = store(data->rev.time, time_buffer);
+ char *t = data->timestamp;
+ strncpy(data->date, data->timestamp, DATE_LENGTH);
+ char *timeinstamp = &data->timestamp[DATE_LENGTH+1];
+ strncpy(data->time, timeinstamp, TIME_LENGTH);
}
-/* currently unused */
-static int
-is_whitespace(char *string) {
- int len = strlen(string);
- while (isspace(string[0]) && strlen(string) > 0) {
- string++;
- }
- if (strcmp(string, "") == 0)
- return 1;
- else
- return 0;
-}
+// like strncat but with previously known length
+char*
+strlcatn(char *dest, const char *src, size_t dest_len, size_t n)
+{
+ //size_t dest_len = strlen(dest);
+ size_t i;
-static void
-squeeze(char *s, int c) {
- int i, j;
- for (i = j = 0; s[i] != '\0'; i++)
- if (s[i] != c)
- s[j++] = s[i];
- s[j] = '\0';
-}
+ for (i = 0 ; i < n && src[i] != '\0' ; i++)
+ dest[dest_len + i] = src[i];
+ dest[dest_len + i] = '\0';
-int
-contains(char *s, char *t)
-{
- char c = t[0]; //just get the first character of t
- int i = 0;
- while (s[i] != '\0') {
- if (s[i] == c)
- return 1;
- i++;
- }
+ return dest;
}
static void
-charhndl(parseData *data, char *s, int len)
+charhndl(void* vdata, const XML_Char* s, int len)
{
+ revisionData* data = (revisionData*) vdata;
if (data->element != UNUSED && data->position != SKIP) {
- char t[len];
- strncpy(t,s,len);
- t[len] = '\0'; // makes t a well-formed string
+ //char t[len];
+ //strncpy(t,s,len);
+ //t[len] = '\0'; // makes t a well-formed string
switch (data->element) {
+ case TEXT:
+ // printf("buffer length = %i, text: %s\n", len, t);
+ strlcatn(data->text, s, data->text_size, len);
+ data->text_size += len;
+ break;
+ case COMMENT:
+ strlcatn(data->comment, s, data->comment_size, len);
+ data->comment_size += len;
+ break;
case TITLE:
- {
- data->rev.title = store(data->rev.title, t);
- // skip any articles with bad characters in their titles
- if (contains(t, data->dropstr)) {
- data->position = SKIP;
- //printf("found a baddie\n");
- }
+ strlcatn(data->title, s, data->title_size, len);
+ data->title_size += len;
break;
- }
case ARTICLEID:
// printf("articleid = %s\n", t);
- data->rev.articleid = store(data->rev.articleid, t);
+ strlcatn(data->articleid, s, data->articleid_size, len);
+ data->articleid_size += len;
break;
case REVID:
// printf("revid = %s\n", t);
- data->rev.revid = store(data->rev.revid, t);
+ strlcatn(data->revid, s, data->revid_size, len);
+ data->revid_size += len;
break;
case TIMESTAMP:
- data->rev.timestamp = store(data->rev.timestamp, t);
- if (strlen(data->rev.timestamp) == TIMESTAMP_LENGTH)
+ strlcatn(data->timestamp, s, data->timestamp_size, len);
+ data->timestamp_size += len;
+ if (strlen(data->timestamp) == TIMESTAMP_LENGTH)
split_timestamp(data);
break;
- case EDITOR: {
- data->rev.editor = store(data->rev.editor, t);
+ case EDITOR:
+ strlcatn(data->editor, s, data->editor_size, len);
+ data->editor_size += len;
break;
- }
case EDITORID:
//printf("editorid = %s\n", t);
- data->rev.editorid = store(data->rev.editorid, t);
+ strlcatn(data->editorid, s, data->editorid_size, len);
+ data->editorid_size += len;
break;
/* the following are implied or skipped:
case MINOR:
break; minor tag is just a tag
case UNUSED:
*/
- case COMMENT:
- // printf("row: comment is %s\n", t);
- data->rev.comment = store(data->rev.comment, t);
- break;
- case TEXT:
- data->rev.text = store(data->rev.text, t);
- break;
default: break;
}
}
}
static void
-start(parseData *data, const char *name, const char **attr)
+start(void* vdata, const XML_Char* name, const XML_Char** attr)
{
+ revisionData* data = (revisionData*) vdata;
if (strcmp(name,"title") == 0) {
cleanup_article(data); // cleans up data from last article
// minor tag has no character data, so we parse here
else if (strcmp(name,"minor") == 0) {
data->element = MINOR;
- data->rev.minor = store(data->rev.minor, "1");
+ data->minor = true;
}
else if (strcmp(name,"timestamp") == 0)
data->element = TIMESTAMP;
static void
-end(parseData *data, const char *name)
+end(void* vdata, const XML_Char* name)
{
+ revisionData* data = (revisionData*) vdata;
if (strcmp(name, "revision") == 0 && data->position != SKIP) {
write_row(data); // crucial... :)
cleanup_revision(data); // also crucial
fprintf(stderr, "Takes a wikimedia data dump XML stream on standard in, and produces\n");
fprintf(stderr, "a tab-separated stream of revisions on standard out:\n");
fprintf(stderr, "\n");
- fprintf(stderr, "title, articleid, revid, date, time, anon, editor, editorid, minor\n");
+ fprintf(stderr, "title, articleid, revid, date, time, anon, editor, editorid, minor, revlength\n");
fprintf(stderr, "\n");
fprintf(stderr, "author: Erik Garrison <erik@hypervolu.me>\n");
}
main(int argc, char *argv[])
{
- char *dropstr = "";
enum outtype output_type;
int dry_run = 0;
// in "simple" output, we don't print text and comments
output_type = SIMPLE;
char c;
- while ((c = getopt(argc, argv, "hr:sd")) != -1)
+ while ((c = getopt(argc, argv, "ht")) != -1)
switch (c)
{
- case 'r':
- dropstr = optarg;
- break;
case 'd':
dry_run = 1;
break;
case 't':
- output_type = NORMAL;
+ output_type = FULL;
break;
case 'h':
print_usage(argv);
if (dry_run) { // lets us print initialization options
printf("simple_output = %i\n", output_type);
- printf("dropstr = %s\n", dropstr);
exit(1);
}
// create a new instance of the expat parser
- XML_Parser parser = XML_ParserCreate(NULL);
+ XML_Parser parser = XML_ParserCreate("UTF-8");
// initialize the user data struct which is passed to callback functions
- parseData data;
+ revisionData data;
// initialize the elements of the struct to default values
- init_data(&data, dropstr, output_type);
+ init_data(&data, output_type);
// makes the parser pass "data" as the first argument to every callback
XML_SetUserData(parser, &data);
+ void (*startFnPtr)(void*, const XML_Char*, const XML_Char**) = start;
+ void (*endFnPtr)(void*, const XML_Char*) = end;
+ void (*charHandlerFnPtr)(void*, const XML_Char*, int) = charhndl;
+
// sets start and end to be the element start and end handlers
- XML_SetElementHandler(parser, (void *) start, (void *) end);
- // sets charhndl to be the callback for raw character data
- XML_SetCharacterDataHandler(parser, (void *) charhndl);
+ XML_SetElementHandler(parser, startFnPtr, endFnPtr);
+ // sets charhndl to be the callback for character data
+ XML_SetCharacterDataHandler(parser, charHandlerFnPtr);
- int done;
+ bool done;
char buf[BUFSIZ];
- write_header();
-
// shovel data into the parser
do {
// read into buf a bufferfull of data from standard input
- size_t len = fread(buf, 1, sizeof(buf), stdin);
- done = len < sizeof(buf); // checks if we've got the last bufferfull
+ size_t len = fread(buf, 1, BUFSIZ, stdin);
+ done = len < BUFSIZ; // checks if we've got the last bufferfull
// passes the buffer of data to the parser and checks for error
// (this is where the callbacks are invoked)