+++ /dev/null
-/***************************************************************************
- * libdisorder: A Library for Measuring Byte Stream Entropy
- * Copyright (C) 2010 Michael E. Locasto
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the:
- * Free Software Foundation, Inc.
- * 59 Temple Place, Suite 330
- * Boston, MA 02111-1307 USA
- *
- * $Id$
- **************************************************************************/
-
-#include <math.h> //for log2()
-#include <stdio.h> //for NULL
-#include "disorder.h"
-
-#if defined(__FreeBSD__)
-#define log2(x) (log((x)) * (1./M_LN2))
-#endif
-
-/** Frequecies for each byte */
-static int m_token_freqs[LIBDO_MAX_BYTES]; //frequency of each token in sample
-static float m_token_probs[LIBDO_MAX_BYTES]; //P(each token appearing)
-static int m_num_tokens = 0; //actual number of `seen' tokens, max 256
-static float m_maxent = 0.0;
-static float m_ratio = 0.0;
-static int LIBDISORDER_INITIALIZED = 0;
-
-static void
-initialize_lib()
-{
- int i = 0;
- if(1==LIBDISORDER_INITIALIZED)
- return;
-
- m_num_tokens = 0;
-
- for(i=0;i<LIBDO_MAX_BYTES;i++)
- {
- m_token_freqs[i]=0;
- m_token_probs[i]=0.0;
- }
-
- LIBDISORDER_INITIALIZED = 1;
-}
-
-/**
- * Set m_num_tokens by iterating over m_token_freq[] and maintaining
- * a counter of each position that does not hold the value of zero.
- */
-static void
-count_num_tokens()
-{
- int i = 0;
- int counter = 0;
- for(i=0;i<LIBDO_MAX_BYTES;i++)
- {
- if(0!=m_token_freqs[i])
- {
- counter++;
- }
- }
- m_num_tokens = counter;
- return;
-}
-
-/**
- * Sum frequencies for each token (i.e., byte values 0 through 255)
- * We assume the `length' parameter is correct.
- *
- * This function is available only to functions in this file.
- */
-static void
-get_token_frequencies(char* buf,
- long long length)
-{
- int i=0;
- char* itr=NULL;
- unsigned char c=0;
-
- itr = buf;
-
- //reset number of tokens
- m_num_tokens = 0;
-
- //make sure freqency and probability arrays are cleared
- for(i=0;i<LIBDO_MAX_BYTES;i++)
- {
- m_token_freqs[i] = 0;
- m_token_probs[i] = 0.0;
- }
-
- for(i=0;i<length;i++)
- {
- c = (unsigned char)*itr;
- //assert(0<=c<LIBDO_MAX_BYTES);
- m_token_freqs[c]++;
- itr++;
- }
-}
-
-/**
- * Return entropy (in bits) of this buffer of bytes. We assume that the
- * `length' parameter is correct. This implementation is a translation
- * of the PHP code found here:
- *
- * http://onlamp.com/pub/a/php/2005/01/06/entropy.html
- *
- * with a helpful hint on the `foreach' statement from here:
- *
- * http://php.net/manual/en/control-structures.foreach.php
- */
-float
-shannon_H(char* buf,
- long long length)
-{
- int i = 0;
- float bits = 0.0;
- char* itr=NULL; //values of itr should be zero to 255
- unsigned char token;
- int num_events = 0; //`length' parameter
- float freq = 0.0; //loop variable for holding freq from m_token_freq[]
- float entropy = 0.0; //running entropy sum
-
- if(NULL==buf || 0==length)
- return 0.0;
-
- if(0==LIBDISORDER_INITIALIZED)
- initialize_lib();
-
- itr = buf;
- m_maxent = 0.0;
- m_ratio = 0.0;
- num_events = length;
- get_token_frequencies(itr, num_events); //modifies m_token_freqs[]
- //set m_num_tokens by counting unique m_token_freqs entries
- count_num_tokens();
-
- if(m_num_tokens>LIBDO_MAX_BYTES)
- {
- //report error somehow?
- return 0.0;
- }
-
- //iterate through whole m_token_freq array, but only count
- //spots that have a registered token (i.e., freq>0)
- for(i=0;i<LIBDO_MAX_BYTES;i++)
- {
- if(0!=m_token_freqs[i])
- {
- token = i;
- freq = ((float)m_token_freqs[token]);
- m_token_probs[token] = (freq / ((float)num_events));
- entropy += m_token_probs[token] * log2(m_token_probs[token]);
- }
- }
-
- bits = -1.0 * entropy;
- m_maxent = log2(m_num_tokens);
- m_ratio = bits / m_maxent;
-
- return bits;
-}
-
-int
-get_num_tokens()
-{
- return m_num_tokens;
-}
-
-float
-get_max_entropy()
-{
- return m_maxent;
-}
-
-float
-get_entropy_ratio()
-{
- return m_ratio;
-}
+++ /dev/null
-/***************************************************************************
- * libdisorder: A Library for Measuring Byte Stream Entropy
- * Copyright (C) 2010 Michael E. Locasto
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the:
- * Free Software Foundation, Inc.
- * 59 Temple Place, Suite 330
- * Boston, MA 02111-1307 USA
- *
- * $Id$
- **************************************************************************/
-
-#ifndef __DISORDER_H_
-#define __DISORDER_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/** Max number of bytes (i.e., tokens) */
-#define LIBDO_MAX_BYTES 256
-
-/** A convienance value for clients of this library. Feel free to change
- * if you plan to use a larger buffer. You can also safely ignore it, as
- * libdisorder does not use this value internally; it relies on the
- * client-supplied `length' parameter.
- *
- * NB: Might become deprecated because it is potentially misleading and
- * has zero relationship to any library internal state.
- */
-#define LIBDO_BUFFER_LEN 16384
-
-/**
- * Given a pointer to an array of bytes, return a float indicating the
- * level of entropy in bits (a number between zero and eight),
- * assuming a space of 256 possible byte values. The second argument
- * indicates the number of bytes in the sequence. If this sequence
- * runs into unallocated memory, this function should fail with a
- * SIGSEGV.
- */
-float shannon_H(char*, long long);
-
-/** Report the number of (unique) tokens seen. This is _not_ the
- number of individual events seen. For example, if the library sees
- the string `aaab', the number of events is 4 and the number of
- tokens is 2. */
-int get_num_tokens(void);
-
-/** Returns maximum entropy for byte distributions log2(256)=8 bits*/
-float get_max_entropy(void);
-
-/** Returns the ratio of entropy to maxentropy */
-float get_entropy_ratio(void);
-
-#ifdef __cplusplus
-};
-#endif
-
-#endif
#include <stdlib.h>
#include "expat.h"
#include <getopt.h>
-#include "disorder.h"
#include "md5.h"
#include "dtl/dtl.hpp"
#include <vector>
<< data->editorid << "\t"
<< ((data->minor) ? "TRUE" : "FALSE") << "\t"
<< (unsigned int) data->text_size << "\t"
- << shannon_H(data->text, data->text_size) << "\t"
<< md5_hex_output << "\t"
<< reverted_to << "\t"
<< (int) additions.size() << "\t"
<< "a tab-separated stream of revisions on standard out:" << endl
<< endl
<< "title, articleid, revid, timestamp, anon, editor, editorid, minor," << endl
- << "text_length, text_entropy, text_md5, reversion, additions_size, deletions_size" << endl
+ << "text_length, text_md5, reversion, additions_size, deletions_size" << endl
<< ".... and additional fields for each regex executed against add/delete diffs" << endl
<< endl
<< "Boolean fields are TRUE/FALSE except in the case of reversion, which is blank" << endl
<< "editor_id" << "\t"
<< "minor" << "\t"
<< "text_size" << "\t"
- << "text_entropy" << "\t"
<< "text_md5" << "\t"
<< "reversion" << "\t"
<< "additions_size" << "\t"