1 /***************************************************************************
2 * libdisorder: A Library for Measuring Byte Stream Entropy
3 * Copyright (C) 2010 Michael E. Locasto
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the:
17 * Free Software Foundation, Inc.
18 * 59 Temple Place, Suite 330
19 * Boston, MA 02111-1307 USA
22 **************************************************************************/
24 #include <math.h> //for log2()
25 #include <stdio.h> //for NULL
28 #if defined(__FreeBSD__)
29 #define log2(x) (log((x)) * (1./M_LN2))
32 /** Frequecies for each byte */
33 static int m_token_freqs[LIBDO_MAX_BYTES]; //frequency of each token in sample
34 static float m_token_probs[LIBDO_MAX_BYTES]; //P(each token appearing)
35 static int m_num_tokens = 0; //actual number of `seen' tokens, max 256
36 static float m_maxent = 0.0;
37 static float m_ratio = 0.0;
38 static int LIBDISORDER_INITIALIZED = 0;
44 if(1==LIBDISORDER_INITIALIZED)
49 for(i=0;i<LIBDO_MAX_BYTES;i++)
55 LIBDISORDER_INITIALIZED = 1;
59 * Set m_num_tokens by iterating over m_token_freq[] and maintaining
60 * a counter of each position that does not hold the value of zero.
67 for(i=0;i<LIBDO_MAX_BYTES;i++)
69 if(0!=m_token_freqs[i])
74 m_num_tokens = counter;
79 * Sum frequencies for each token (i.e., byte values 0 through 255)
80 * We assume the `length' parameter is correct.
82 * This function is available only to functions in this file.
85 get_token_frequencies(char* buf,
94 //reset number of tokens
97 //make sure freqency and probability arrays are cleared
98 for(i=0;i<LIBDO_MAX_BYTES;i++)
100 m_token_freqs[i] = 0;
101 m_token_probs[i] = 0.0;
104 for(i=0;i<length;i++)
106 c = (unsigned char)*itr;
107 //assert(0<=c<LIBDO_MAX_BYTES);
114 * Return entropy (in bits) of this buffer of bytes. We assume that the
115 * `length' parameter is correct. This implementation is a translation
116 * of the PHP code found here:
118 * http://onlamp.com/pub/a/php/2005/01/06/entropy.html
120 * with a helpful hint on the `foreach' statement from here:
122 * http://php.net/manual/en/control-structures.foreach.php
130 char* itr=NULL; //values of itr should be zero to 255
132 int num_events = 0; //`length' parameter
133 float freq = 0.0; //loop variable for holding freq from m_token_freq[]
134 float entropy = 0.0; //running entropy sum
136 if(NULL==buf || 0==length)
139 if(0==LIBDISORDER_INITIALIZED)
146 get_token_frequencies(itr, num_events); //modifies m_token_freqs[]
147 //set m_num_tokens by counting unique m_token_freqs entries
150 if(m_num_tokens>LIBDO_MAX_BYTES)
152 //report error somehow?
156 //iterate through whole m_token_freq array, but only count
157 //spots that have a registered token (i.e., freq>0)
158 for(i=0;i<LIBDO_MAX_BYTES;i++)
160 if(0!=m_token_freqs[i])
163 freq = ((float)m_token_freqs[token]);
164 m_token_probs[token] = (freq / ((float)num_events));
165 entropy += m_token_probs[token] * log2(m_token_probs[token]);
169 bits = -1.0 * entropy;
170 m_maxent = log2(m_num_tokens);
171 m_ratio = bits / m_maxent;