#define TIME_LENGTH 8
#define TIMESTAMP_LENGTH 20
+// 2048 KB in bytes + 1
+#define TEXT_BUFFER_SIZE 2097153
+#define FIELD_BUFFER_SIZE 1024
+
enum elements {
TITLE, ARTICLEID, REVISION, REVID, TIMESTAMP, CONTRIBUTOR,
EDITOR, EDITORID, MINOR, COMMENT, UNUSED, TEXT
enum block { TITLE_BLOCK, REVISION_BLOCK, CONTRIBUTOR_BLOCK, SKIP };
-enum outtype { NORMAL, SIMPLE };
+enum outtype { FULL, SIMPLE };
typedef struct {
- struct {
- char *title;
- char *articleid;
- char *revid;
- char *date;
- char *time;
- char *timestamp;
- char *anon;
- char *editor;
- char *editorid;
- char *minor;
- char *comment;
- char *text;
- } rev;
+ /* per-article strings: only reset/freed when the "title" flag is passed */
+ char *title;
+ char *articleid;
+ /* per-revision heap strings; NULL means "not seen for this revision" */
+ char *revid;
+ char *date; /* filled in by split_timestamp() */
+ char *time; /* filled in by split_timestamp() */
+ char *timestamp;
+ char *anon; /* NOTE(review): not assigned anywhere in the code shown here */
+ char *editor;
+ char *editorid;
+ bool minor; /* set to true when a <minor> start tag is seen */
+ char *comment;
+ /* fixed-size buffer that TEXT character data is strcat()'d into */
+ char text[TEXT_BUFFER_SIZE];
- char *dropstr;
enum elements element;
enum block position;
enum outtype output_type;
-} parseData;
+} revisionData;
/* free_data and clean_data
* Also, frees memory dynamically allocated to store data.
*/
+/* clean_data: reset the per-revision fields of *data to their empty state
+ * without freeing anything.  When title is non-zero the per-article fields
+ * (title, articleid) are reset as well.  Any heap strings still referenced
+ * by *data must already have been released (see free_data), otherwise the
+ * pointers are simply overwritten. */
static void
-clean_data(parseData *data, int title)
+clean_data(revisionData *data, int title)
{
if (title) {
- data->rev.title = NULL;
- data->rev.articleid = NULL;
+ data->title = NULL;
+ data->articleid = NULL;
}
- data->rev.revid = NULL;
- data->rev.date = NULL;
- data->rev.time = NULL;
- data->rev.timestamp = NULL;
- data->rev.anon = NULL;
- data->rev.editor = NULL;
- data->rev.editorid = NULL;
- data->rev.minor = NULL;
- data->rev.comment = NULL;
- data->rev.text = NULL;
+ data->revid = NULL;
+ data->date = NULL;
+ data->time = NULL;
+ data->timestamp = NULL;
+ data->anon = NULL;
+ data->editor = NULL;
+ data->editorid = NULL;
+ data->minor = false;
+ data->comment = NULL;
+ // text is an inline array, not a pointer, so it has no NULL state: empty
+ // it in place so a freshly initialised struct holds a valid C string
+ // before anything calls strcat()/strlen()/printf() on it
+ data->text[0] = '\0';
data->element = UNUSED;
//data->position =
}
+/* free_data: release the heap strings owned by *data.  The per-article
+ * strings (title, articleid) are released only when title is non-zero.
+ * The pointers themselves are left untouched; the cleanup_*() wrappers
+ * call clean_data() right afterwards to reset them. */
static void
-free_data(parseData *data, int title)
+free_data(revisionData *data, int title)
{
if (title) {
//printf("freeing article\n");
- free(data->rev.title);
- free(data->rev.articleid);
+ free(data->title);
+ free(data->articleid);
}
- free(data->rev.revid);
- free(data->rev.date);
- free(data->rev.time);
- free(data->rev.timestamp);
- free(data->rev.anon);
- free(data->rev.editor);
- free(data->rev.editorid);
- free(data->rev.minor);
- free(data->rev.comment);
- free(data->rev.text);
+ free(data->revid);
+ free(data->date);
+ free(data->time);
+ free(data->timestamp);
+ free(data->anon);
+ free(data->editor);
+ free(data->editorid);
+ free(data->comment);
+ // text lives inside the struct: there is nothing to free, just truncate it
+ data->text[0] = '\0';
}
+/* cleanup_revision: free and reset the per-revision fields only; the
+ * article-level title and articleid are kept. */
-cleanup_revision(parseData *data) {
+void
+cleanup_revision(revisionData *data)
+{
free_data(data, 0);
clean_data(data, 0);
}
+/* cleanup_article: free and reset every field, including the article-level
+ * title and articleid. */
-cleanup_article(parseData *data) {
+void
+cleanup_article(revisionData *data)
+{
free_data(data, 1);
clean_data(data, 1);
}
+/* init_data: put a freshly declared revisionData into a known state and
+ * record the requested output mode.  Must run before the struct is handed
+ * to the parser callbacks. */
static void
-init_data(parseData *data, char *dropstr, int output_type)
+init_data(revisionData *data, enum outtype output_type)
{
clean_data(data, 1); // sets every element to null...
- data->dropstr = dropstr;
+ // text is an array inside the struct and has no NULL state; make sure it
+ // starts out as an empty string before anything appends to or prints it
+ data->text[0] = '\0';
data->output_type = output_type;
}
/* for debugging only, prints out the state of the data struct
*/
static void
-print_state(parseData *data)
+print_state(revisionData *data)
{
+ // Fields that have not been seen yet are NULL; passing NULL to %s is
+ // undefined behaviour, so unset fields are printed as "(null)" explicitly.
printf("element = %i\n", data->element);
printf("output_type = %i\n", data->output_type);
- printf("title = %s\n", data->rev.title);
- printf("articleid = %s\n", data->rev.articleid);
- printf("revid = %s\n", data->rev.revid);
- printf("date = %s\n", data->rev.date);
- printf("time = %s\n", data->rev.time);
- printf("anon = %s\n", data->rev.anon);
- printf("editor = %s\n", data->rev.editor);
- printf("editorid = %s\n", data->rev.editorid);
- printf("minor = %s\n", data->rev.minor);
- printf("comment = %s\n", data->rev.comment);
- printf("text = %s\n", data->rev.text);
+ printf("title = %s\n", (data->title != NULL) ? data->title : "(null)");
+ printf("articleid = %s\n", (data->articleid != NULL) ? data->articleid : "(null)");
+ printf("revid = %s\n", (data->revid != NULL) ? data->revid : "(null)");
+ printf("date = %s\n", (data->date != NULL) ? data->date : "(null)");
+ printf("time = %s\n", (data->time != NULL) ? data->time : "(null)");
+ printf("anon = %s\n", (data->anon != NULL) ? data->anon : "(null)");
+ printf("editor = %s\n", (data->editor != NULL) ? data->editor : "(null)");
+ printf("editorid = %s\n", (data->editorid != NULL) ? data->editorid : "(null)");
+ printf("minor = %s\n", (data->minor ? "1" : "0"));
+ printf("comment = %s\n", (data->comment != NULL) ? data->comment : "(null)");
+ printf("text = %s\n", data->text);
printf("\n");
}
* it is called right before cleanup_revision() and cleanup_article()
*/
static void
-write_row(parseData *data)
+write_row(revisionData *data)
{
- // define temporary variables to hold output values:
- char *title, *articleid;
- char *revid, *date, *time, *anon, *editor, *editorid;
- char *minor, *comment;
- char *text;
- // perform some simple logic to obtain correct output values
-
- if (data->rev.minor == NULL)
- minor = "0";
- else minor = data->rev.minor;
-
- if (data->rev.editor == NULL)
- anon = "1";
- else anon = "0";
-
- if (data->rev.title == NULL)
- title = "";
- else title = data->rev.title;
-
- if (data->rev.articleid == NULL)
- articleid = "";
- else articleid = data->rev.articleid;
-
- if (data->rev.revid == NULL)
- revid = "";
- else revid = data->rev.revid;
-
- if (data->rev.date == NULL)
- date = "";
- else date = data->rev.date;
-
- if (data->rev.time == NULL)
- time = "";
- else time = data->rev.time;
-
- if (data->rev.editor == NULL)
- editor = "";
- else editor = data->rev.editor;
-
- if (data->rev.editorid == NULL)
- editorid = "";
- else editorid = data->rev.editorid;
-
- if (data->rev.text == NULL)
- text = "";
- else text = data->rev.text;
-
-
- if (data->rev.comment == NULL)
- comment = "";
- else comment = data->rev.comment;
-
// TODO: make it so you can specify fields to output
// note that date and time are separated by a space, to match postgres's
// timestamp format
+ // Columns shared by both output modes; a missing field is written as an
+ // empty string, and the "anon" column is 1 exactly when no editor name
+ // was seen for this revision.
+ printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s",
+ (data->title != NULL) ? data->title : "",
+ (data->articleid != NULL) ? data->articleid : "",
+ (data->revid != NULL) ? data->revid : "",
+ (data->date != NULL) ? data->date : "",
+ (data->time != NULL) ? data->time : "",
+ (data->editor != NULL) ? "0" : "1",
+ (data->editor != NULL) ? data->editor : "",
+ (data->editorid != NULL) ? data->editorid : "",
+ (data->minor) ? "1" : "0");
switch (data->output_type)
{
- case NORMAL:
- printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s\t%s\t%s\n",
- title,articleid,revid,date,time,anon,editor,editorid,minor,comment,text);
- break;
case SIMPLE:
- printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s\n",
- title,articleid,revid,date,time,anon,editor,editorid,minor);
+ // simple mode: only the length of the text; %zu matches strlen()'s size_t
+ printf("\t%zu\n", strlen(data->text));
+ break;
+ case FULL:
+ // a revision without a <comment> leaves comment NULL; print "" for it,
+ // as the other columns do, instead of handing NULL to %s
+ printf("\t%s\t%s\n",
+ (data->comment != NULL) ? data->comment : "", data->text);
break;
}
}
-static char
+void
*timestr(char *timestamp, char time_buffer[TIME_LENGTH+1])
{
char *timeinstamp = &timestamp[DATE_LENGTH+1];
}
-static char
+void
*datestr(char *timestamp, char date_buffer[DATE_LENGTH+1])
{
strncpy(date_buffer, timestamp, DATE_LENGTH);
}
+/* append: grow the heap string entry so that newstr is concatenated onto it.
+ * entry must be a non-NULL, NUL-terminated string obtained from
+ * malloc/realloc; it is consumed, and the (possibly moved) result is
+ * returned for the caller to own.  Terminates the program if memory runs
+ * out rather than writing through a NULL pointer. */
char
-*append(char *entry, char *new)
+*append(char *entry, char *newstr)
{
char *newbuff;
- int len;
- len = (strlen(entry)+strlen(new))*sizeof(char) + 1;
- newbuff = realloc(entry, len);
- strcat(newbuff, new);
+ size_t oldlen = strlen(entry);
+ size_t addlen = strlen(newstr);
+ newbuff = (char*) realloc(entry, oldlen + addlen + 1);
+ if (newbuff == NULL) {
+ fprintf(stderr, "append: out of memory\n");
+ exit(EXIT_FAILURE);
+ }
+ // both lengths are already known, so copy straight to the old end
+ // (including newstr's terminator) instead of letting strcat() rescan
+ memcpy(newbuff + oldlen, newstr, addlen + 1);
return newbuff;
}
+/* cache: return a freshly malloc'ed copy of newstr, owned by the caller.
+ * entry is not used; store() only calls cache() when entry is NULL.
+ * Terminates the program if memory runs out rather than copying into a
+ * NULL pointer. */
char
-*cache(char *entry, char *new)
+*cache(char *entry, char *newstr)
{
char *newbuff;
- int len;
- len = strlen(new)*sizeof(char) + 1; // include space for the '\0' !
- newbuff = malloc(len);
- strcpy(newbuff,new);
+ size_t len = strlen(newstr) + 1; // include space for the '\0' !
+ (void) entry;
+ newbuff = (char*) malloc(len);
+ if (newbuff == NULL) {
+ fprintf(stderr, "cache: out of memory\n");
+ exit(EXIT_FAILURE);
+ }
+ memcpy(newbuff, newstr, len);
return newbuff;
}
+/* store: start a new heap string from newstr when entry is NULL, otherwise
+ * extend entry with newstr.  The returned pointer replaces entry. */
char
-*store(char *entry, char *new)
+*store(char *entry, char *newstr)
{
- char *newbuff;
- if (entry == NULL)
- newbuff = cache(entry, new);
- else
- newbuff = append(entry, new);
- return newbuff;
+ return (entry == NULL) ? cache(entry, newstr) : append(entry, newstr);
}
+/* split_timestamp: derive the date and time strings from data->timestamp
+ * via datestr()/timestr() and store them in data->date and data->time. */
void
-split_timestamp(parseData *data)
+split_timestamp(revisionData *data)
{
- char *t = data->rev.timestamp;
char date_buffer[DATE_LENGTH+1];
char time_buffer[TIME_LENGTH+1];
- datestr(t, date_buffer);
- timestr(t, time_buffer);
- data->rev.date = store(data->rev.date, date_buffer);
- data->rev.time = store(data->rev.time, time_buffer);
+ datestr(data->timestamp, date_buffer);
+ timestr(data->timestamp, time_buffer);
+ data->date = store(data->date, date_buffer);
+ data->time = store(data->time, time_buffer);
}
/* currently unused */
}
static void
-squeeze(char *s, int c) {
- int i, j;
- for (i = j = 0; s[i] != '\0'; i++)
- if (s[i] != c)
- s[j++] = s[i];
- s[j] = '\0';
-}
-
-int
-contains(char *s, char *t)
-{
- char c = t[0]; //just get the first character of t
- int i = 0;
- while (s[i] != '\0') {
- if (s[i] == c)
- return 1;
- i++;
- }
-}
-
-static void
-charhndl(parseData *data, char *s, int len)
+charhndl(void* vdata, const XML_Char* s, int len)
{
+ revisionData* data = (revisionData*) vdata;
if (data->element != UNUSED && data->position != SKIP) {
char t[len];
strncpy(t,s,len);
switch (data->element) {
case TITLE:
{
- data->rev.title = store(data->rev.title, t);
+ data->title = store(data->title, t);
// skip any articles with bad characters in their titles
- if (contains(t, data->dropstr)) {
- data->position = SKIP;
- //printf("found a baddie\n");
- }
break;
}
case ARTICLEID:
// printf("articleid = %s\n", t);
- data->rev.articleid = store(data->rev.articleid, t);
+ data->articleid = store(data->articleid, t);
break;
case REVID:
// printf("revid = %s\n", t);
- data->rev.revid = store(data->rev.revid, t);
+ data->revid = store(data->revid, t);
break;
case TIMESTAMP:
- data->rev.timestamp = store(data->rev.timestamp, t);
- if (strlen(data->rev.timestamp) == TIMESTAMP_LENGTH)
+ data->timestamp = store(data->timestamp, t);
+ if (strlen(data->timestamp) == TIMESTAMP_LENGTH)
split_timestamp(data);
break;
case EDITOR: {
- data->rev.editor = store(data->rev.editor, t);
+ data->editor = store(data->editor, t);
break;
}
case EDITORID:
//printf("editorid = %s\n", t);
- data->rev.editorid = store(data->rev.editorid, t);
+ data->editorid = store(data->editorid, t);
break;
/* the following are implied or skipped:
case MINOR:
*/
case COMMENT:
// printf("row: comment is %s\n", t);
- data->rev.comment = store(data->rev.comment, t);
+ //if (data->output_type == FULL) {
+ data->comment = store(data->comment, t);
+ //}
break;
case TEXT:
- data->rev.text = store(data->rev.text, t);
+ //if (data->output_type == FULL) {
+ //data->text = store(data->text, t);
+ //
+ strcat(data->text, t);
+ //}
break;
default: break;
}
}
static void
-start(parseData *data, const char *name, const char **attr)
+start(void* vdata, const XML_Char* name, const XML_Char** attr)
{
+ revisionData* data = (revisionData*) vdata;
if (strcmp(name,"title") == 0) {
cleanup_article(data); // cleans up data from last article
// minor tag has no character data, so we parse here
else if (strcmp(name,"minor") == 0) {
data->element = MINOR;
- data->rev.minor = store(data->rev.minor, "1");
+ data->minor = true;
}
else if (strcmp(name,"timestamp") == 0)
data->element = TIMESTAMP;
static void
-end(parseData *data, const char *name)
+end(void* vdata, const XML_Char* name)
{
+ revisionData* data = (revisionData*) vdata;
if (strcmp(name, "revision") == 0 && data->position != SKIP) {
write_row(data); // crucial... :)
cleanup_revision(data); // also crucial
main(int argc, char *argv[])
{
- char *dropstr = "";
enum outtype output_type;
int dry_run = 0;
// in "simple" output, we don't print text and comments
output_type = SIMPLE;
char c;
- while ((c = getopt(argc, argv, "hr:sd")) != -1)
+ while ((c = getopt(argc, argv, "ht")) != -1)
switch (c)
{
- case 'r':
- dropstr = optarg;
- break;
case 'd':
dry_run = 1;
break;
case 't':
- output_type = NORMAL;
+ output_type = FULL;
break;
case 'h':
print_usage(argv);
if (dry_run) { // lets us print initialization options
printf("simple_output = %i\n", output_type);
- printf("dropstr = %s\n", dropstr);
exit(1);
}
XML_Parser parser = XML_ParserCreate(NULL);
// initialize the user data struct which is passed to callback functions
- parseData data;
+ revisionData data;
// initialize the elements of the struct to default values
- init_data(&data, dropstr, output_type);
+ init_data(&data, output_type);
// makes the parser pass "data" as the first argument to every callback
XML_SetUserData(parser, &data);
+ void (*startFnPtr)(void*, const XML_Char*, const XML_Char**) = start;
+ void (*endFnPtr)(void*, const XML_Char*) = end;
+ void (*charHandlerFnPtr)(void*, const XML_Char*, int) = charhndl;
+
// sets start and end to be the element start and end handlers
- XML_SetElementHandler(parser, (void *) start, (void *) end);
+ XML_SetElementHandler(parser, startFnPtr, endFnPtr);
// sets charhndl to be the callback for raw character data
- XML_SetCharacterDataHandler(parser, (void *) charhndl);
+ XML_SetCharacterDataHandler(parser, charHandlerFnPtr);
int done;
char buf[BUFSIZ];