From: Benjamin Mako Hill Date: Sat, 12 Apr 2014 02:56:45 +0000 (-0700) Subject: remove shannon entropy as something computed X-Git-Url: https://projects.mako.cc/source/wikiq/commitdiff_plain/fd20cb1bc83a8062f951fc43ef7dd16d4da1ab04 remove shannon entropy as something computed It's really just not something most people will use, and it's somewhat expensive to compute. --- diff --git a/Makefile b/Makefile index cc0e56f..b7df471 100644 --- a/Makefile +++ b/Makefile @@ -1,13 +1,12 @@ CXXFLAGS = -O3 CFLAGS = $(CXXFLAGS) -OBJECTS = wikiq.o md5.o disorder.o +OBJECTS = wikiq.o md5.o all: wikiq wikiq: $(OBJECTS) $(CXX) $(CXXFLAGS) $(OBJECTS) -lpcrecpp -lpcre -lexpat -o wikiq -disorder.o: disorder.h md5.o: md5.h clean: diff --git a/disorder.c b/disorder.c deleted file mode 100644 index a5f7c35..0000000 --- a/disorder.c +++ /dev/null @@ -1,192 +0,0 @@ -/*************************************************************************** - * libdisorder: A Library for Measuring Byte Stream Entropy - * Copyright (C) 2010 Michael E. Locasto - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the: - * Free Software Foundation, Inc. 
- * 59 Temple Place, Suite 330 - * Boston, MA 02111-1307 USA - * - * $Id$ - **************************************************************************/ - -#include //for log2() -#include //for NULL -#include "disorder.h" - -#if defined(__FreeBSD__) -#define log2(x) (log((x)) * (1./M_LN2)) -#endif - -/** Frequecies for each byte */ -static int m_token_freqs[LIBDO_MAX_BYTES]; //frequency of each token in sample -static float m_token_probs[LIBDO_MAX_BYTES]; //P(each token appearing) -static int m_num_tokens = 0; //actual number of `seen' tokens, max 256 -static float m_maxent = 0.0; -static float m_ratio = 0.0; -static int LIBDISORDER_INITIALIZED = 0; - -static void -initialize_lib() -{ - int i = 0; - if(1==LIBDISORDER_INITIALIZED) - return; - - m_num_tokens = 0; - - for(i=0;iLIBDO_MAX_BYTES) - { - //report error somehow? - return 0.0; - } - - //iterate through whole m_token_freq array, but only count - //spots that have a registered token (i.e., freq>0) - for(i=0;i #include "expat.h" #include -#include "disorder.h" #include "md5.h" #include "dtl/dtl.hpp" #include @@ -345,7 +344,6 @@ write_row(revisionData *data) << data->editorid << "\t" << ((data->minor) ? "TRUE" : "FALSE") << "\t" << (unsigned int) data->text_size << "\t" - << shannon_H(data->text, data->text_size) << "\t" << md5_hex_output << "\t" << reverted_to << "\t" << (int) additions.size() << "\t" @@ -540,7 +538,7 @@ void print_usage(char* argv[]) { << "a tab-separated stream of revisions on standard out:" << endl << endl << "title, articleid, revid, timestamp, anon, editor, editorid, minor," << endl - << "text_length, text_entropy, text_md5, reversion, additions_size, deletions_size" << endl + << "text_length, text_md5, reversion, additions_size, deletions_size" << endl << ".... 
and additional fields for each regex executed against add/delete diffs" << endl << endl << "Boolean fields are TRUE/FALSE except in the case of reversion, which is blank" << endl @@ -642,7 +640,6 @@ main(int argc, char *argv[]) << "editor_id" << "\t" << "minor" << "\t" << "text_size" << "\t" - << "text_entropy" << "\t" << "text_md5" << "\t" << "reversion" << "\t" << "additions_size" << "\t"