Previously, when appending character data to long text fields, each call
to charhndl required a call to strlen on the text field to which the new
character data was to be appended. This is O(N^2)! The problem was
most severe for vandalized articles with inlined HTML, but it ultimately
affected all data parsing as expat calls the charhndl function every
time it resolves a default XML entity.
By tracking the length of each field in our revisionData structure and
using a custom strncat-type function, it's possible to avoid this
overhead. Now, we are pipe-buffer-bound when processing a 7z-compressed
mediawiki dump. The current simple english wiki dump takes about 3
minutes to process on my 2x2.4ghz laptop, even when handling all text
data. Just decompressing it to /dev/null takes around 1 minute.
#include "expat.h"
#include <getopt.h>
#include "expat.h"
#include <getopt.h>
// timestamp of the form 2003-11-07T00:43:23Z
#define DATE_LENGTH 10
#define TIME_LENGTH 8
#define TIMESTAMP_LENGTH 20
// timestamp of the form 2003-11-07T00:43:23Z
#define DATE_LENGTH 10
#define TIME_LENGTH 8
#define TIMESTAMP_LENGTH 20
-// 2048 KB in bytes + 1
-#define TEXT_BUFFER_SIZE 2097153
+#define MEGABYTE 1048576
#define FIELD_BUFFER_SIZE 1024
#define FIELD_BUFFER_SIZE 1024
+// 2048 KB in bytes + 1
+//#define TEXT_BUFFER_SIZE 2097153
+//#define TEXT_BUFFER_SIZE 10485760
enum elements {
TITLE, ARTICLEID, REVISION, REVID, TIMESTAMP, CONTRIBUTOR,
enum elements {
TITLE, ARTICLEID, REVISION, REVID, TIMESTAMP, CONTRIBUTOR,
+ // pointers to once-allocated buffers
char *title;
char *articleid;
char *revid;
char *title;
char *articleid;
char *revid;
char *anon;
char *editor;
char *editorid;
char *anon;
char *editor;
char *editorid;
- char text[TEXT_BUFFER_SIZE];
+ char *text;
+
+ // track string size of the elements, to prevent O(N^2) processing in charhndl
+ // when we have to take strlen for every character which we append to the buffer
+ size_t title_size;
+ size_t articleid_size;
+ size_t revid_size;
+ size_t date_size;
+ size_t time_size;
+ size_t timestamp_size;
+ size_t anon_size;
+ size_t editor_size;
+ size_t editorid_size;
+ size_t comment_size;
+ size_t text_size;
+
+ bool minor;
enum elements element;
enum block position;
enum elements element;
enum block position;
static void
clean_data(revisionData *data, int title)
{
static void
clean_data(revisionData *data, int title)
{
+ // reset title (if we are switching articles)
- data->title = NULL;
- data->articleid = NULL;
+ data->title[0] = '\0';
+ data->articleid[0] = '\0';
+ data->title_size = 0;
+ data->articleid_size = 0;
- data->revid = NULL;
- data->date = NULL;
- data->time = NULL;
- data->timestamp = NULL;
- data->anon = NULL;
- data->editor = NULL;
- data->editorid = NULL;
+
+ // reset text fields
+ data->revid[0] = '\0';
+ data->date[0] = '\0';
+ data->time[0] = '\0';
+ data->timestamp[0] = '\0';
+ data->anon[0] = '\0';
+ data->editor[0] = '\0';
+ data->editorid[0] = '\0';
+ data->comment[0] = '\0';
+ data->text[0] = '\0';
+
+ // reset length tracking
+ data->revid_size = 0;
+ data->date_size = 0;
+ data->time_size = 0;
+ data->timestamp_size = 0;
+ data->anon_size = 0;
+ data->editor_size = 0;
+ data->editorid_size = 0;
+ data->comment_size = 0;
+ data->text_size = 0;
+
+ // reset flags and element type info
- data->comment = NULL;
- //data->text = NULL;
static void
free_data(revisionData *data, int title)
{
static void
free_data(revisionData *data, int title)
{
free(data->editor);
free(data->editorid);
free(data->comment);
free(data->editor);
free(data->editorid);
free(data->comment);
- //free(data->text);
- data->text[0] = '\0';
}
void cleanup_revision(revisionData *data) {
}
void cleanup_revision(revisionData *data) {
clean_data(data, 0);
}
void cleanup_article(revisionData *data) {
clean_data(data, 0);
}
void cleanup_article(revisionData *data) {
static void
init_data(revisionData *data, outtype output_type)
{
static void
init_data(revisionData *data, outtype output_type)
{
- clean_data(data, 1); // sets every element to null...
+ data->text = (char*) malloc(4 * MEGABYTE); // 2MB is the article length limit, 4MB is 'safe'?
+ data->comment = (char*) malloc(FIELD_BUFFER_SIZE);
+ data->title = (char*) malloc(FIELD_BUFFER_SIZE);
+ data->articleid = (char*) malloc(FIELD_BUFFER_SIZE);
+ data->revid = (char*) malloc(FIELD_BUFFER_SIZE);
+ data->date = (char*) malloc(FIELD_BUFFER_SIZE);
+ data->time = (char*) malloc(FIELD_BUFFER_SIZE);
+ data->timestamp = (char*) malloc(FIELD_BUFFER_SIZE);
+ data->anon = (char*) malloc(FIELD_BUFFER_SIZE);
+ data->editor = (char*) malloc(FIELD_BUFFER_SIZE);
+ data->editorid = (char*) malloc(FIELD_BUFFER_SIZE);
+ data->minor = false;
+
+ // resets the data fields, null terminates strings, sets lengths
+ clean_data(data, 1);
+
data->output_type = output_type;
}
data->output_type = output_type;
}
// note that date and time are separated by a space, to match postgres's
// timestamp format
printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s",
// note that date and time are separated by a space, to match postgres's
// timestamp format
printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s",
- (data->title != NULL) ? data->title : "",
- (data->articleid != NULL) ? data->articleid : "",
- (data->revid != NULL) ? data->revid : "",
- (data->date != NULL) ? data->date : "",
- (data->time != NULL) ? data->time : "",
- (data->editor != NULL) ? "0" : "1",
- (data->editor != NULL) ? data->editor : "",
- (data->editorid != NULL) ? data->editorid : "",
+ data->title,
+ data->articleid,
+ data->revid,
+ data->date,
+ data->time,
+ (data->editor[0] != '\0') ? "0" : "1", // anon?
+ data->editor,
+ data->editorid,
(data->minor) ? "1" : "0");
switch (data->output_type)
{
case SIMPLE:
printf("\t%i\n", (unsigned int) strlen(data->text));
(data->minor) ? "1" : "0");
switch (data->output_type)
{
case SIMPLE:
printf("\t%i\n", (unsigned int) strlen(data->text));
break;
case FULL:
printf("\t%s\t%s\n", data->comment, data->text);
break;
case FULL:
printf("\t%s\t%s\n", data->comment, data->text);
-void
-*timestr(char *timestamp, char time_buffer[TIME_LENGTH+1])
-{
- char *timeinstamp = ×tamp[DATE_LENGTH+1];
- strncpy(time_buffer, timeinstamp, TIME_LENGTH);
- time_buffer[TIME_LENGTH] = '\0'; // makes it a well-formed string
-}
-
-
-void
-*datestr(char *timestamp, char date_buffer[DATE_LENGTH+1])
-{
- strncpy(date_buffer, timestamp, DATE_LENGTH);
- date_buffer[DATE_LENGTH] = '\0';
-}
-
char
*append(char *entry, char *newstr)
{
char
*append(char *entry, char *newstr)
{
split_timestamp(revisionData *data)
{
char *t = data->timestamp;
split_timestamp(revisionData *data)
{
char *t = data->timestamp;
- char date_buffer[DATE_LENGTH+1];
- char time_buffer[TIME_LENGTH+1];
- datestr(t, date_buffer);
- timestr(t, time_buffer);
- data->date = store(data->date, date_buffer);
- data->time = store(data->time, time_buffer);
+ strncpy(data->date, data->timestamp, DATE_LENGTH);
+ char *timeinstamp = &data->timestamp[DATE_LENGTH+1];
+ strncpy(data->time, timeinstamp, TIME_LENGTH);
+// like strncat but with previously known length
+char*
+strlcatn(char *dest, const char *src, size_t dest_len, size_t n)
+{
+ //size_t dest_len = strlen(dest);
+ size_t i;
+
+ for (i = 0 ; i < n && src[i] != '\0' ; i++)
+ dest[dest_len + i] = src[i];
+ dest[dest_len + i] = '\0';
+
+ return dest;
+}
+
static void
charhndl(void* vdata, const XML_Char* s, int len)
{
revisionData* data = (revisionData*) vdata;
if (data->element != UNUSED && data->position != SKIP) {
static void
charhndl(void* vdata, const XML_Char* s, int len)
{
revisionData* data = (revisionData*) vdata;
if (data->element != UNUSED && data->position != SKIP) {
- char t[len];
- strncpy(t,s,len);
- t[len] = '\0'; // makes t a well-formed string
+ //char t[len];
+ //strncpy(t,s,len);
+ //t[len] = '\0'; // makes t a well-formed string
+ case TEXT:
+ // printf("buffer length = %i, text: %s\n", len, t);
+ strlcatn(data->text, s, data->text_size, len);
+ data->text_size += len;
+ break;
+ case COMMENT:
+ strlcatn(data->comment, s, data->comment_size, len);
+ data->comment_size += len;
+ break;
- {
- data->title = store(data->title, t);
- // skip any articles with bad characters in their titles
+ strlcatn(data->title, s, data->title_size, len);
+ data->title_size += len;
case ARTICLEID:
// printf("articleid = %s\n", t);
case ARTICLEID:
// printf("articleid = %s\n", t);
- data->articleid = store(data->articleid, t);
+ strlcatn(data->articleid, s, data->articleid_size, len);
+ data->articleid_size += len;
break;
case REVID:
// printf("revid = %s\n", t);
break;
case REVID:
// printf("revid = %s\n", t);
- data->revid = store(data->revid, t);
+ strlcatn(data->revid, s, data->revid_size, len);
+ data->revid_size += len;
- data->timestamp = store(data->timestamp, t);
+ strlcatn(data->timestamp, s, data->timestamp_size, len);
+ data->timestamp_size += len;
if (strlen(data->timestamp) == TIMESTAMP_LENGTH)
split_timestamp(data);
break;
if (strlen(data->timestamp) == TIMESTAMP_LENGTH)
split_timestamp(data);
break;
- case EDITOR: {
- data->editor = store(data->editor, t);
+ case EDITOR:
+ strlcatn(data->editor, s, data->editor_size, len);
+ data->editor_size += len;
case EDITORID:
//printf("editorid = %s\n", t);
case EDITORID:
//printf("editorid = %s\n", t);
- data->editorid = store(data->editorid, t);
+ strlcatn(data->editorid, s, data->editorid_size, len);
+ data->editorid_size += len;
break;
/* the following are implied or skipped:
case MINOR:
break;
/* the following are implied or skipped:
case MINOR:
break; minor tag is just a tag
case UNUSED:
*/
break; minor tag is just a tag
case UNUSED:
*/
- case COMMENT:
- // printf("row: comment is %s\n", t);
- //if (data->output_type == FULL) {
- data->comment = store(data->comment, t);
- //}
- break;
- case TEXT:
- //if (data->output_type == FULL) {
- //data->text = store(data->text, t);
- //
- strcat(data->text, t);
- //}
- break;
fprintf(stderr, "Takes a wikimedia data dump XML stream on standard in, and produces\n");
fprintf(stderr, "a tab-separated stream of revisions on standard out:\n");
fprintf(stderr, "\n");
fprintf(stderr, "Takes a wikimedia data dump XML stream on standard in, and produces\n");
fprintf(stderr, "a tab-separated stream of revisions on standard out:\n");
fprintf(stderr, "\n");
- fprintf(stderr, "title, articleid, revid, date, time, anon, editor, editorid, minor\n");
+ fprintf(stderr, "title, articleid, revid, date, time, anon, editor, editorid, minor, revlength\n");
fprintf(stderr, "\n");
fprintf(stderr, "author: Erik Garrison <erik@hypervolu.me>\n");
}
fprintf(stderr, "\n");
fprintf(stderr, "author: Erik Garrison <erik@hypervolu.me>\n");
}
}
// create a new instance of the expat parser
}
// create a new instance of the expat parser
- XML_Parser parser = XML_ParserCreate(NULL);
+ XML_Parser parser = XML_ParserCreate("UTF-8");
// initialize the user data struct which is passed to callback functions
revisionData data;
// initialize the user data struct which is passed to callback functions
revisionData data;
// sets start and end to be the element start and end handlers
XML_SetElementHandler(parser, startFnPtr, endFnPtr);
// sets start and end to be the element start and end handlers
XML_SetElementHandler(parser, startFnPtr, endFnPtr);
- // sets charhndl to be the callback for raw character data
+ // sets charhndl to be the callback for character data
XML_SetCharacterDataHandler(parser, charHandlerFnPtr);
XML_SetCharacterDataHandler(parser, charHandlerFnPtr);
// shovel data into the parser
do {
// read into buf a bufferfull of data from standard input
// shovel data into the parser
do {
// read into buf a bufferfull of data from standard input
- size_t len = fread(buf, 1, sizeof(buf), stdin);
- done = len < sizeof(buf); // checks if we've got the last bufferfull
+ size_t len = fread(buf, 1, BUFSIZ, stdin);
+ done = len < BUFSIZ; // checks if we've got the last bufferfull
// passes the buffer of data to the parser and checks for error
// (this is where the callbacks are invoked)
// passes the buffer of data to the parser and checks for error
// (this is where the callbacks are invoked)