From: Erik Garrison Date: Sun, 24 Oct 2010 20:28:15 +0000 (-0400) Subject: added shannon_H entropy metric for each revision X-Git-Url: https://projects.mako.cc/source/wikiq/commitdiff_plain/ae96c7a9d99998ad0be4bb8aa382b92df32533a5 added shannon_H entropy metric for each revision --- diff --git a/Makefile b/Makefile index bf436d0..fc80d1a 100644 --- a/Makefile +++ b/Makefile @@ -1,13 +1,17 @@ CXX = g++ CFLAGS = -O3 +OBJECTS = disorder.o all: wikiq -wikiq: wikiq.c - $(CXX) $(CFLAGS) wikiq.c -o wikiq -lexpat +wikiq: wikiq.c $(OBJECTS) + $(CXX) $(CFLAGS) wikiq.c $(OBJECTS) -o wikiq -lexpat + +disorder.o: disorder.c + $(CXX) $(CFLAGS) -c disorder.c clean: - rm -f wikiq + rm -f wikiq $(OBJECTS) gprof: $(MAKE) CFLAGS=-pg wikiq diff --git a/disorder.c b/disorder.c new file mode 100644 index 0000000..a5f7c35 --- /dev/null +++ b/disorder.c @@ -0,0 +1,192 @@ +/*************************************************************************** + * libdisorder: A Library for Measuring Byte Stream Entropy + * Copyright (C) 2010 Michael E. Locasto + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the: + * Free Software Foundation, Inc. + * 59 Temple Place, Suite 330 + * Boston, MA 02111-1307 USA + * + * $Id$ + **************************************************************************/ + +#include //for log2() +#include //for NULL +#include "disorder.h" + +#if defined(__FreeBSD__) +#define log2(x) (log((x)) * (1./M_LN2)) +#endif + +/** Frequecies for each byte */ +static int m_token_freqs[LIBDO_MAX_BYTES]; //frequency of each token in sample +static float m_token_probs[LIBDO_MAX_BYTES]; //P(each token appearing) +static int m_num_tokens = 0; //actual number of `seen' tokens, max 256 +static float m_maxent = 0.0; +static float m_ratio = 0.0; +static int LIBDISORDER_INITIALIZED = 0; + +static void +initialize_lib() +{ + int i = 0; + if(1==LIBDISORDER_INITIALIZED) + return; + + m_num_tokens = 0; + + for(i=0;iLIBDO_MAX_BYTES) + { + //report error somehow? + return 0.0; + } + + //iterate through whole m_token_freq array, but only count + //spots that have a registered token (i.e., freq>0) + for(i=0;i #include "expat.h" #include +#include "disorder.h" // timestamp of the form 2003-11-07T00:43:23Z #define DATE_LENGTH 10 @@ -225,7 +226,7 @@ write_row(revisionData *data) switch (data->output_type) { case SIMPLE: - printf("\t%i\n", (unsigned int) strlen(data->text)); + printf("\t%i\t%f\n", (unsigned int) strlen(data->text), shannon_H(data->text, data->text_size)); //printf("\n"); break; case FULL: @@ -235,40 +236,6 @@ write_row(revisionData *data) } -char -*append(char *entry, char *newstr) -{ - char *newbuff; - int len; - len = (strlen(entry)+strlen(newstr))*sizeof(char) + 1; - newbuff = (char*) realloc(entry, len); - strcat(newbuff, newstr); - return newbuff; -} - -char -*cache(char *entry, char *newstr) -{ - char *newbuff; - int len; - len = strlen(newstr)*sizeof(char) + 1; // include space for the '\0' ! - newbuff = (char*) malloc(len); - strcpy(newbuff,newstr); - return newbuff; - -} - -char -*store(char *entry, char *newstr) -{ - char *newbuff; - if (entry == NULL) - newbuff = cache(entry, newstr); - else - newbuff = append(entry, newstr); - return newbuff; -} - void split_timestamp(revisionData *data) { @@ -278,19 +245,6 @@ split_timestamp(revisionData *data) strncpy(data->time, timeinstamp, TIME_LENGTH); } -/* currently unused */ -static int -is_whitespace(char *string) { - int len = strlen(string); - while (isspace(string[0]) && strlen(string) > 0) { - string++; - } - if (strcmp(string, "") == 0) - return 1; - else - return 0; -} - // like strncat but with previously known length char* strlcatn(char *dest, const char *src, size_t dest_len, size_t n)