Initial code checkin

2009-01-20 10:52:53 +00:00 · 2009-01-20 10:52:53 +00:00 · 9efeb7d8f3
commit 9efeb7d8f3
5 changed files with 1163 additions and 0 deletions
--- a/65
+++ b/65
@ -0,0 +1,65 @@
+PySpamSum v1.0
+==============
+
+spamsum is a fuzzy hash specifically designed for hashing email messages
+to detect if they are SPAM. The spamsum utility includes the ability to
+generate the spamsum hash and check a new message against an existing set
+of hashes to find a match.
+
+pyspamsum is a Python wrapper for the core API of spamsum.
+
+The original spamsum code has been licensed under the terms of the
+Perl Artistic License. It has been slightly modified; the changes are
+listed in the header comment of spamsum.c.
+
+The original code is Copyright Andrew Tridgell <tridge@samba.org> 2002.
+It forms part of Andrew's junkcode, and is available here:
+
+    http://www.samba.org/junkcode/#spamsum
+
+The spamsum code in this project is derived from an updated version that
+was published at Linux.conf.au 2004:
+
+    http://linux.anu.edu.au/linux.conf.au/2004/papers/junkcode/spamsum
+
+For details on spamsum itself, please see the spamsum README:
+
+    http://samba.org/ftp/unpacked/junkcode/spamsum/README
+
+This Python wrapper is released under the new BSD license, and is
+Copyright Russell Keith-Magee <russell@keith-magee.com> 2009.
+
+Installation
+------------
+
+At a prompt, run:
+
+$ python setup.py install
+
+Usage
+-----
+
+# Import spamsum and set up some strings
+>>> import spamsum
+>>> s1 = "I am the very model of a modern Major-General, I've information animal and vegegtable and mineral"
+>>> s2 = "I am the very model of a modern Brigadier, I've information animal and vegetable and something else"
+>>> s3 = "Huh? Gilbert and Who?"
+
+# Evaluate the edit distance between two strings
+>>> spamsum.edit_distance(s1, s2)
+28
+
+# Evaluate the spamsum of some strings
+>>> sum1 = spamsum.spamsum(s1)
+>>> sum2 = spamsum.spamsum(s2)
+>>> sum3 = spamsum.spamsum(s3)
+>>> print sum1
+3:kEvyc/sFIKwYclQY4MKLFE4IgunfELzIKygn:kE6Ai3KQ/MKOgWf/KZn
+
+# Compare two spamsums. 0 = no match, 100 = perfect match.
+>>> spamsum.match(sum1, sum1)
+100
+>>> spamsum.match(sum1, sum2)
+66
+>>> spamsum.match(sum1, sum3)
+0
+
--- a/edit_dist.c
+++ b/edit_dist.c
@ -0,0 +1,269 @@
+/*
+  This edit distance code is taken from trn3.6. A few minor
+  modifications have been made by Andrew Tridgell <tridge@samba.org>
+  for use in spamsum.
+*/
+
+
+/***************************************************************************/
+
+
+/* The authors make no claims as to the fitness or correctness of this software
+ * for any use whatsoever, and it is provided as is. Any use of this software
+ * is at the user's own risk.
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+
+/* edit_dist -- returns the minimum edit distance between two strings
+
+	Program by:  Mark Maimone   CMU Computer Science   13 Nov 89
+	Last Modified:  28 Jan 90
+
+   If the input strings have length n and m, the algorithm runs in time
+   O(nm) and space O(min(m,n)).
+
+HISTORY
+   13 Nov 89 (mwm) Created edit_dist() and set_costs().
+
+   28 Jan 90 (mwm) Added view_costs().  Should verify that THRESHOLD
+   computations will work even when THRESHOLD is not a multiple of
+   sizeof(int).
+
+   17 May 93 (mwm) Improved performance when used with trn's newsgroup
+   processing; assume all costs are 1, and you can terminate when a
+   threshold is exceeded.
+*/
+
+#define MIN_DIST 100
+
+#define	TRN_SPEEDUP		/* Use a less-general version of the
+				   routine, one that's better for trn.
+				   All change costs are 1, and it's okay
+				   to terminate if the edit distance is
+				   known to exceed MIN_DIST */
+
+#define THRESHOLD 4000		/* worry about allocating more memory only
+				   when this # of bytes is exceeded */
+#define STRLENTHRESHOLD ((int) ((THRESHOLD / sizeof (int) - 3) / 2))
+
+#define SAFE_ASSIGN(x,y) (((x) != NULL) ? (*(x) = (y)) : (y))
+
+#define swap_int(x,y)  (_iswap = (x), (x) = (y), (y) = _iswap)
+#define swap_char(x,y) (_cswap = (x), (x) = (y), (y) = _cswap)
+#define min3(x,y,z) (_mx = (x), _my = (y), _mz = (z), (_mx < _my ? (_mx < _mz ? _mx : _mz) : (_mz < _my) ? _mz : _my))
+#define min2(x,y) (_mx = (x), _my = (y), (_mx < _my ? _mx : _my))
+
+
+static int insert_cost = 1;
+static int delete_cost = 1;
+#ifndef TRN_SPEEDUP
+static int change_cost = 1;
+static int swap_cost   = 1;
+#endif
+
+static int _iswap;			/* swap_int temp variable */
+static char *_cswap;			/* swap_char temp variable */
+static int _mx, _my, _mz;		/* min2, min3 temp variables */
+
+
+
+/* edit_distn -- returns the edit distance between two strings, or -1 on
+   failure.
+
+   from, to          the strings to compare; they are read by explicit
+                     length, so they do not need to be NUL-terminated
+   from_len, to_len  the number of characters to use from each string; a
+                     NULL pointer or a length <= 0 is treated as empty
+
+   With TRN_SPEEDUP defined the costs are fixed at insert = delete = 1,
+   change = 3 and swap = 5, and the scan stops early once every entry of
+   a row (after the first) exceeds MIN_DIST; in that case the value
+   returned is the last entry computed, not the exact distance.
+
+   -1 is returned only when the heap allocation needed for long inputs
+   fails.
+
+   Not reentrant: small inputs share a static buffer, and the min/swap
+   macros use file-level temporaries. */
+
+int
+edit_distn(char *from, int from_len, char *to, int to_len)
+{
+#ifndef TRN_SPEEDUP
+    register int ins, del, ch;	  	/* local copies of edit costs */
+#endif
+    register int row, col, index;	/* dynamic programming counters */
+    register int radix;			/* radix for modular indexing */
+#ifdef TRN_SPEEDUP
+    register int low;
+#endif
+    int *buffer;			/* pointer to storage for one row
+					   of the d.p. array */
+    static int store[THRESHOLD / sizeof (int)];
+					/* a small amount of static
+					   storage, to be used when the
+					   input strings are small enough */
+
+/* Handle trivial cases when one string is empty */
+
+    if (from == NULL || from_len <= 0) {
+	if (to == NULL || to_len <= 0)
+	    return 0;
+	return to_len * insert_cost;
+    }
+    if (to == NULL || to_len <= 0)
+	return from_len * delete_cost;
+
+/* Initialize registers */
+
+#ifdef TRN_SPEEDUP
+#define ins 1
+#define del 1
+#define ch 3
+#define swap_cost 5
+#else
+    ins  = insert_cost;
+    del  = delete_cost;
+    ch   = change_cost;
+#endif
+
+/* Make   from   short enough to fit in the static storage, if it's at all
+   possible */
+
+    if (from_len > to_len && from_len > STRLENTHRESHOLD) {
+	swap_int(from_len, to_len);
+	swap_char(from, to);
+#ifndef TRN_SPEEDUP
+	swap_int(ins, del);
+#endif
+    } /* if from_len > to_len */
+
+/* The radix must be derived from the row length that is actually used,
+   i.e. AFTER the possible swap above.  Computing it from the pre-swap
+   length breaks the NW/N/W/NNWW offsets below and lets   index   run past
+   the end of the static storage. */
+
+    radix = 2 * from_len + 3;
+
+/* Allocate the array storage (from the heap if necessary) */
+
+    if (from_len <= STRLENTHRESHOLD)
+	buffer = store;
+    else {
+	buffer = (int *) malloc(radix * sizeof (int));
+	if (buffer == NULL)
+	    return -1;
+    }
+
+/* Here's where the fun begins.  We will find the minimum edit distance
+   using dynamic programming.  We only need to store two rows of the matrix
+   at a time, since we always progress down the matrix.  For example,
+   given the strings "one" and "two", and insert, delete and change costs
+   equal to 1:
+
+	   _  o  n  e
+	_  0  1  2  3
+	t  1  1  2  3
+	w  2  2  2  3
+	o  3  2  3  3
+
+   The dynamic programming recursion is defined as follows:
+
+	ar(x,0) := x * insert_cost
+	ar(0,y) := y * delete_cost
+	ar(x,y) := min(a(x - 1, y - 1) + (from[x] == to[y] ? 0 : change),
+		       a(x - 1, y) + insert_cost,
+		       a(x, y - 1) + delete_cost,
+		       a(x - 2, y - 2) + (from[x] == to[y-1] &&
+					  from[x-1] == to[y] ? swap_cost :
+					  infinity))
+
+   Since this only looks at most two rows and three columns back, we need
+   only store the values for the two preceding rows.  In this
+   implementation, we do not explicitly store the zero column, so only 2 *
+   from_len + 2   words are needed.  However, in the implementation of the
+   swap_cost   check, the current matrix value is used as a buffer; we
+   can't overwrite the earlier value until the   swap_cost   check has
+   been performed.  So we use   2 * from_len + 3   elements in the buffer.
+*/
+
+#define ar(x,y,index) (((x) == 0) ? (y) * del : (((y) == 0) ? (x) * ins : \
+	buffer[mod(index)]))
+#define NW(x,y)	  ar(x, y, index + from_len + 2)
+#define N(x,y)	  ar(x, y, index + from_len + 3)
+#define W(x,y)	  ar(x, y, index + radix - 1)
+#define NNWW(x,y) ar(x, y, index + 1)
+#define mod(x) ((x) % radix)
+
+    index = 0;
+
+#ifdef DEBUG_EDITDIST
+    printf("      ");
+    for (col = 0; col < from_len; col++)
+	printf(" %c ", from[col]);
+    printf("\n   ");
+
+    for (col = 0; col <= from_len; col++)
+	printf("%2d ", col * del);
+#endif
+
+/* Row 0 is handled implicitly; its value at a given column is   col*del.
+   The loop below computes the values for Row 1.  At this point we know the
+   strings are nonempty.  We also don't need to consider swap costs in row
+   1.
+
+   COMMENT:  the indices   row and col   below point into the STRING, so
+   the corresponding MATRIX indices are   row+1 and col+1.
+*/
+
+    buffer[index++] = min2(ins + del, (from[0] == to[0] ? 0 : ch));
+#ifdef TRN_SPEEDUP
+    low = buffer[mod(index + radix - 1)];
+#endif
+
+#ifdef DEBUG_EDITDIST
+    printf("\n %c %2d %2d ", to[0], ins, buffer[index - 1]);
+#endif
+
+    for (col = 1; col < from_len; col++) {
+	buffer[index] = min3(
+		col * del + ((from[col] == to[0]) ? 0 : ch),
+		(col + 1) * del + ins,
+		buffer[index - 1] + del);
+#ifdef TRN_SPEEDUP
+	if (buffer[index] < low)
+	    low = buffer[index];
+#endif
+	index++;
+
+#ifdef DEBUG_EDITDIST
+	printf("%2d ", buffer[index - 1]);
+#endif
+
+    } /* for col = 1 */
+
+#ifdef DEBUG_EDITDIST
+    printf("\n %c %2d ", to[1], 2 * ins);
+#endif
+
+/* Now handle the rest of the matrix */
+
+    for (row = 1; row < to_len; row++) {
+	for (col = 0; col < from_len; col++) {
+	    buffer[index] = min3(
+		    NW(row, col) + ((from[col] == to[row]) ? 0 : ch),
+		    N(row, col + 1) + ins,
+		    W(row + 1, col) + del);
+	    if (from[col] == to[row - 1] && col > 0 &&
+		    from[col - 1] == to[row])
+		buffer[index] = min2(buffer[index],
+			NNWW(row - 1, col - 1) + swap_cost);
+
+#ifdef DEBUG_EDITDIST
+	    printf("%2d ", buffer[index]);
+#endif
+#ifdef TRN_SPEEDUP
+	    /* track the smallest entry of the current row */
+	    if (buffer[index] < low || col == 0)
+		low = buffer[index];
+#endif
+
+	    index = mod(index + 1);
+	} /* for col = 1 */
+#ifdef DEBUG_EDITDIST
+	if (row < to_len - 1)
+	    printf("\n %c %2d ", to[row+1], (row + 2) * ins);
+	else
+	    printf("\n");
+#endif
+#ifdef TRN_SPEEDUP
+	/* every entry of this row already exceeds MIN_DIST: give up */
+	if (low > MIN_DIST)
+	    break;
+#endif
+    } /* for row = 1 */
+
+    /* the result is the entry written most recently */
+    row = buffer[mod(index + radix - 1)];
+    if (buffer != store)
+	free((char *) buffer);
+    return row;
+} /* edit_distn */
+
--- a/pyspamsum.c
+++ b/pyspamsum.c
@ -0,0 +1,95 @@
+/**
+ * PySpamsum v1.0.0
+ *
+ * A Python wrapper around the spamsum library written by
+ * Andrew Tridgell.
+ *
+ * Copyright 2009 Russell Keith-Magee <russell@keith-magee.com>
+ */
+#include <Python.h>
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+int edit_distn(char *from, int from_len, char *to, int to_len);
+char *spamsum(const unsigned char *in, size_t length, unsigned int flags, unsigned int bsize);
+unsigned int spamsum_match(const char *str1, const char *str2);
+
+/*
+ * edit_distance(from, to) -> int
+ *
+ * Python entry point: parses two string arguments and returns the value
+ * computed by edit_distn() for them as a Python integer.  Returns NULL
+ * (leaving the exception set by PyArg_ParseTuple) when the arguments
+ * cannot be parsed.
+ */
+PyObject *py_edit_distance(PyObject *self, PyObject *args)
+{
+    char *lhs, *rhs;
+    int lhs_len, rhs_len;
+
+    if (PyArg_ParseTuple(args, "s#s#", &lhs, &lhs_len, &rhs, &rhs_len))
+    {
+        return Py_BuildValue("i", edit_distn(lhs, lhs_len, rhs, rhs_len));
+    }
+
+    return NULL;
+}
+
+
+/*
+ * spamsum(data[, flags[, bsize]]) -> str
+ *
+ * Python entry point: computes the spamsum signature of `data`.  `flags`
+ * and `bsize` are optional and default to 0; they are passed straight
+ * through to spamsum().
+ *
+ * Returns a new Python string, or NULL with an exception set when the
+ * arguments cannot be parsed or spamsum() returns NULL.
+ */
+PyObject *py_spamsum(PyObject *self, PyObject *args)
+{
+    PyObject *result = NULL;
+    char *sum;
+
+    unsigned char *in;
+    /* This file does not define PY_SSIZE_T_CLEAN, so "s#" stores the
+       length through an int pointer.  The receiving variable therefore
+       has to be an int: a size_t would only be partly written where
+       size_t is wider than int. */
+    int length = 0;
+    unsigned int flags, bsize;
+
+    flags = 0;
+    bsize = 0;
+
+    if (!PyArg_ParseTuple(args, "s#|ii", &in, &length, &flags, &bsize))
+    {
+        return NULL;
+    }
+
+    sum = spamsum(in, (size_t) length, flags, bsize);
+    if (sum == NULL)
+    {
+        return PyErr_NoMemory();
+    }
+
+    /* Py_BuildValue copies the C string; the buffer returned by spamsum()
+       is allocated with malloc() and must be released here. */
+    result = Py_BuildValue("s", sum);
+    free(sum);
+
+    return result;
+}
+
+/*
+ * match(sum1, sum2) -> int
+ *
+ * Python entry point: parses two spamsum signature strings and returns
+ * the score computed by spamsum_match() for them as a Python integer.
+ * Returns NULL (leaving the exception set by PyArg_ParseTuple) when the
+ * arguments cannot be parsed.
+ */
+PyObject *py_match(PyObject *self, PyObject *args)
+{
+    char *lhs, *rhs;
+    unsigned int score;
+
+    if (PyArg_ParseTuple(args, "ss", &lhs, &rhs))
+    {
+        score = spamsum_match(lhs, rhs);
+        return Py_BuildValue("i", score);
+    }
+
+    return NULL;
+}
+
+
+/*
+ * Method table for the module: maps each Python-visible function name to
+ * its C implementation above.  All entries take positional arguments only
+ * (METH_VARARGS).  The all-NULL entry terminates the table.
+ */
+static PyMethodDef methods[] = {
+    {"edit_distance", py_edit_distance, METH_VARARGS,
+        "Calculate the edit distance between two strings."
+    },
+    {"spamsum", py_spamsum, METH_VARARGS,
+        "Calculate the spamsum of a string."
+    },
+    {"match", py_match, METH_VARARGS,
+        "Given two spamsum strings return a value indicating the degree to which they match."
+    },
+    {NULL, NULL, 0, NULL}
+};
+
+/*
+ * Module initialisation function: registers the "spamsum" module with the
+ * method table above.  The module object returned by Py_InitModule is
+ * deliberately discarded.
+ */
+PyMODINIT_FUNC
+initspamsum()
+{
+    (void) Py_InitModule("spamsum", methods);
+}
--- a/setup.py
+++ b/setup.py
@ -0,0 +1,55 @@
+from distutils.core import setup, Extension
+
+setup(name = "spamsum",
+    version = "1.0.0",
+    author = "Russell Keith-Magee",
+    author_email = "russell@keith-magee.com",
+    url = 'http://code.google.com/p/pyspamsum/',
+    license = "New BSD",
+    classifiers = [
+        'Development Status :: 5 - Production/Stable',
+        'License :: OSI Approved :: BSD License',
+        'Operating System :: OS Independent',
+        'Topic :: Text Processing',
+        'Topic :: Utilities',
+    ],
+    platforms = ["any"],
+    description = "A Python wrapper for Andrew Tridgell's spamsum algorithm",
+    long_description = """
+spamsum is a fuzzy hash specifically designed for hashing email messages
+to detect if they are SPAM. The spamsum utility includes the ability to
+generate the spamsum hash and check a new message against a existing set
+of hashes to find a match.
+
+pyspamsum is a Python wrapper for the core API of spamsum.
+
+The original spamsum code has been licensed under the terms of the
+the Perl Artistic License. It has been slightly modified
+
+The original code is Copyright Andrew Tridgell <tridge@samba.org> 2002.
+It forms part of Andrew's junkcode, and is available here:
+
+    http://www.samba.org/junkcode/#spamsum
+
+The spamsum code in this project is derived from an updated version that
+was published at Linux.conf.au 2004:
+
+    http://linux.anu.edu.au/linux.conf.au/2004/papers/junkcode/spamsum
+
+For details on spamsum itself, please see the spamsum README:
+
+    http://samba.org/ftp/unpacked/junkcode/spamsum/README
+""",
+    classifiers = [
+
+    ],
+    ext_modules = [
+        Extension(
+            "spamsum", [
+                "pyspamsum.c",
+                "spamsum.c",
+                "edit_dist.c",
+            ]
+        )
+    ]
+)
--- a/spamsum.c
+++ b/spamsum.c
@ -0,0 +1,679 @@
+/*
+  this is a checksum routine that is specifically designed for spam.
+  Copyright Andrew Tridgell <tridge@samba.org> 2002
+
+  This code is released under the GNU General Public License version 2
+  or later.  Alteratively, you may also use this code under the terms
+  of the Perl Artistic license.
+
+  If you wish to distribute this code under the terms of a different
+  free software license then please ask me. If there is a good reason
+  then I will probably say yes.
+
+  ---
+
+  Modified by Russell Keith-Magee, 20 Jan 2009:
+  * removed the condition preventing comparison of small block sizes
+      (lines 364-366)
+  * Modified the help string to be legal cross platform C.
+*/
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <ctype.h>
+
+/* the output is a string of length 64 in base64 */
+#define SPAMSUM_LENGTH 64
+
+#define MIN_BLOCKSIZE 3
+#define HASH_PRIME 0x01000193
+#define HASH_INIT 0x28021967
+
+#define ROLLING_WINDOW 7
+
+#ifndef MIN
+#define MIN(a,b) ((a)<(b)?(a):(b))
+#endif
+
+#ifndef MAX
+#define MAX(a,b) ((a)>(b)?(a):(b))
+#endif
+
+typedef unsigned u32;
+typedef unsigned char uchar;
+
+#define FLAG_IGNORE_WHITESPACE 1
+#define FLAG_IGNORE_HEADERS 2
+
+/*
+  state of the rolling hash. There is a single, file-level instance, so
+  everything that uses roll_hash()/roll_reset() shares it.
+*/
+static struct {
+	uchar window[ROLLING_WINDOW];	/* the most recent bytes, stored circularly
+					   at position n % ROLLING_WINDOW */
+	u32 h1, h2, h3;			/* the three components summed by roll_hash() */
+	u32 n;				/* number of bytes fed in since the last reset */
+} roll_state;
+
+/*
+  feed one byte into the rolling hash and return the updated hash value.
+
+  The hash is modelled on the Adler checksum and covers only the last
+  ROLLING_WINDOW bytes, which is what lets the signature resynchronise
+  after inserts and deletes:
+
+    h1  sum of the bytes currently in the window
+    h2  sum of the bytes in the window, each weighted by its index
+    h3  shift/xor hash, mostly needed to cope with large blocksize values
+
+  The state lives in the file-level roll_state.
+*/
+static inline u32 roll_hash(uchar c)
+{
+	const u32 slot = roll_state.n % ROLLING_WINDOW;
+
+	/* h2 has to be updated first: it needs the value of h1 from before
+	   this byte was added */
+	roll_state.h2 = roll_state.h2 - roll_state.h1 + ROLLING_WINDOW * c;
+
+	/* the new byte enters the window, the oldest one leaves it */
+	roll_state.h1 = roll_state.h1 + c - roll_state.window[slot];
+	roll_state.window[slot] = c;
+	roll_state.n++;
+
+	roll_state.h3 = ((roll_state.h3 << 5) & 0xFFFFFFFF) ^ c;
+
+	return roll_state.h1 + roll_state.h2 + roll_state.h3;
+}
+
+/*
+  clear the rolling hash state (window, h1, h2, h3 and the byte counter
+  all become zero). The return value is the hash of that empty state,
+  which is always 0.
+*/
+static u32 roll_reset(void)
+{
+	const u32 initial_hash = 0;
+
+	memset(&roll_state, 0, sizeof roll_state);
+
+	return initial_hash;
+}
+
+/*
+  one step of a simple non-rolling hash based on the FNV hash: multiply
+  the running value h by HASH_PRIME, then xor in the byte c. Returns the
+  new running value; h itself is passed by value and not modified.
+*/
+static inline u32 sum_hash(uchar c, u32 h)
+{
+	return (h * HASH_PRIME) ^ c;
+}
+
+/*
+  take a message of length 'length' and return a string representing a hash of that message,
+  prefixed by the selected blocksize
+
+  in      the message bytes; only 'length' bytes are read, so the buffer
+          does not have to be NUL terminated
+  flags   FLAG_IGNORE_HEADERS skips everything up to and including the
+          first empty line ("\n\n"); FLAG_IGNORE_WHITESPACE skips every
+          byte for which isspace() is true
+  bsize   block size to use, or 0 to let the function choose one
+
+  The result has the form "<blocksize>:<sig1>:<sig2>", where sig2 is
+  computed with twice the block size. It is allocated with malloc() and
+  must be released by the caller with free().
+
+  Returns NULL if the allocation fails, or if bsize is so large that twice
+  its value does not fit in a u32.
+
+  The rolling hash state is a file-level variable, so this function is
+  not reentrant.
+*/
+char *spamsum(const uchar *in, size_t length, u32 flags, u32 bsize)
+{
+	const char *b64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+	char *ret, *p;
+	u32 total_chars;
+	u32 h, h2, h3;
+	u32 j, n, k;
+	size_t i;
+	u32 block_size;
+	uchar ret2[SPAMSUM_LENGTH/2 + 1];
+
+	/* block_size*2 is used as a modulus below; if it wrapped around to
+	   zero we would divide by zero */
+	if (bsize > ((u32)~0u >> 1)) return NULL;
+
+	/* if we are ignoring email headers then skip past them now. The
+	   search is bounded by 'length' because the input (for example a
+	   memory mapped file) is not guaranteed to be NUL terminated */
+	if (flags & FLAG_IGNORE_HEADERS) {
+		size_t off;
+		for (off = 0; off + 1 < length; off++) {
+			if (in[off] == '\n' && in[off+1] == '\n') {
+				in += off + 2;
+				length -= off + 2;
+				break;
+			}
+		}
+	}
+
+	if (flags & FLAG_IGNORE_WHITESPACE) {
+		/* count the non-ignored chars */
+		for (n=0, i=0; i<length; i++) {
+			if (isspace(in[i])) continue;
+			n++;
+		}
+		total_chars = n;
+	} else {
+		total_chars = length;
+	}
+
+	if (bsize == 0) {
+		/* guess a reasonable block size. The comparison is done in
+		   a wider type so that the product cannot wrap around for
+		   very large messages */
+		block_size = MIN_BLOCKSIZE;
+		while ((unsigned long long)block_size * SPAMSUM_LENGTH < total_chars) {
+			block_size = block_size * 2;
+		}
+	} else {
+		block_size = bsize;
+	}
+
+	ret = malloc(SPAMSUM_LENGTH + SPAMSUM_LENGTH/2 + 20);
+	if (!ret) return NULL;
+
+again:
+	/* the first part of the spamsum signature is the blocksize */
+	snprintf(ret, 12, "%u:", block_size);
+	p = ret + strlen(ret);
+
+	memset(p, 0, SPAMSUM_LENGTH+1);
+	memset(ret2, 0, sizeof(ret2));
+
+	k = j = 0;
+	h3 = h2 = HASH_INIT;
+	h = roll_reset();
+
+	for (i=0; i<length; i++) {
+		if ((flags & FLAG_IGNORE_WHITESPACE) &&
+		    isspace(in[i])) continue;
+
+		/*
+		   at each character we update the rolling hash and
+		   the normal hash. When the rolling hash hits the
+		   reset value then we emit the normal hash as a
+		   element of the signature and reset both hashes
+		*/
+		h = roll_hash(in[i]);
+		h2 = sum_hash(in[i], h2);
+		h3 = sum_hash(in[i], h3);
+
+		if (h % block_size == (block_size-1)) {
+			/* we have hit a reset point. We now emit a
+			   hash which is based on all characters in the
+			   piece of the message between the last reset
+			   point and this one */
+			p[j] = b64[h2 % 64];
+			if (j < SPAMSUM_LENGTH-1) {
+				/* we can have a problem with the tail
+				   overflowing. The easiest way to
+				   cope with this is to only reset the
+				   second hash if we have room for
+				   more characters in our
+				   signature. This has the effect of
+				   combining the last few pieces of
+				   the message into a single piece */
+				h2 = HASH_INIT;
+				j++;
+			}
+		}
+
+		/* this produces a second signature with a block size
+		   of block_size*2. By producing dual signatures in
+		   this way the effect of small changes in the message
+		   size near a block size boundary is greatly reduced. */
+		if (h % (block_size*2) == ((block_size*2)-1)) {
+			ret2[k] = b64[h3 % 64];
+			if (k < SPAMSUM_LENGTH/2-1) {
+				h3 = HASH_INIT;
+				k++;
+			}
+		}
+	}
+
+	/* if we have anything left then add it to the end. This
+	   ensures that the last part of the message is always
+	   considered */
+	if (h != 0) {
+		p[j] = b64[h2 % 64];
+		ret2[k] = b64[h3 % 64];
+	}
+
+	/* both buffers were zero filled above, so they are NUL terminated
+	   and strcat() appends directly after the last emitted character */
+	strcat(p+j, ":");
+	strcat(p+j, (const char *)ret2);
+
+	/* our blocksize guess may have been way off - repeat if necessary */
+	if (bsize == 0 && block_size > MIN_BLOCKSIZE && j < SPAMSUM_LENGTH/2) {
+		block_size = block_size / 2;
+		goto again;
+	}
+
+	return ret;
+}
+
+
+/*
+   report whether two signature strings share a substring of length
+   ROLLING_WINDOW. A match is only accepted when such a substring
+   exists; this dramatically drops the false positive rate for low
+   score thresholds while having a negligible effect on the rate of
+   spam detection.
+
+   s1 must not be longer than SPAMSUM_LENGTH characters, because one
+   hash per character of s1 is stored in a fixed array of that size
+   (score_strings() checks this before calling).
+
+   return 1 if the two strings do have a common substring, 0 otherwise
+*/
+static int has_common_substring(const char *s1, const char *s2)
+{
+	const int tail = ROLLING_WINDOW - 1;
+	u32 window_hash[SPAMSUM_LENGTH];
+	int count = 0;
+	int pos, k;
+
+	/* the rolling hash is re-used here as a cheap filter for
+	   candidate substring matches */
+	memset(window_hash, 0, sizeof(window_hash));
+
+	/* pass 1: the windowed rolling hash at every offset of s1 */
+	roll_reset();
+	while (s1[count] != '\0') {
+		window_hash[count] = roll_hash((uchar)s1[count]);
+		count++;
+	}
+
+	/* pass 2: roll over s2 and compare each full-window hash with all
+	   full-window hashes of s1. Equal hashes only mean "maybe", so
+	   every hit is confirmed with a direct string comparison */
+	roll_reset();
+	for (pos = 0; s2[pos] != '\0'; pos++) {
+		const u32 cur = roll_hash((uchar)s2[pos]);
+		const char *cand;
+
+		/* the window is not full yet */
+		if (pos < tail) continue;
+
+		cand = s2 + pos - tail;
+		for (k = tail; k < count; k++) {
+			if (window_hash[k] == 0 || window_hash[k] != cur) continue;
+			if (strlen(cand) < ROLLING_WINDOW) continue;
+			if (strncmp(cand, s1 + k - tail, ROLLING_WINDOW) == 0) {
+				return 1;
+			}
+		}
+	}
+
+	return 0;
+}
+
+
+/*
+  eliminate sequences of longer than 3 identical characters. These
+  sequences contain very little information so they tend to just bias
+  the result unfairly
+
+  Returns a newly allocated copy of str in which every run of identical
+  characters is cut down to at most 3, or NULL if the copy cannot be
+  allocated. The caller owns the result and must free() it.
+*/
+static char *eliminate_sequences(const char *str)
+{
+	char *ret;
+	size_t i, j, len;
+
+	ret = strdup(str);
+	if (!ret) return NULL;
+
+	len = strlen(str);
+
+	/* a string of up to 3 characters cannot contain a run longer than
+	   3, so the copy is already the answer. Returning here also keeps
+	   the terminator write below inside the buffer: for len < 3 the
+	   copy holds fewer than 4 bytes and ret[3] would be out of bounds */
+	if (len <= 3) return ret;
+
+	/* the first 3 characters are always kept; after that a character
+	   is dropped only when it equals all 3 of its predecessors */
+	for (i=j=3;i<len;i++) {
+		if (str[i] != str[i-1] ||
+		    str[i] != str[i-2] ||
+		    str[i] != str[i-3]) {
+			ret[j++] = str[i];
+		}
+	}
+
+	ret[j] = 0;
+
+	return ret;
+}
+
+/*
+  the low level string scoring algorithm. It takes the two signature
+  strings s1 and s2 and scores them on a scale of 0-100 where 0 is a
+  terrible match and 100 is a great match.
+
+  0 is also returned when either string is longer than SPAMSUM_LENGTH
+  or when the two strings have no common substring of length
+  ROLLING_WINDOW.
+
+  block_size is accepted but currently unused: the cap that depended on
+  it (limiting the score for small block sizes) has been disabled.
+*/
+static unsigned score_strings(const char *s1, const char *s2, u32 block_size)
+{
+	int edit_distn(const char *from, int from_len, const char *to, int to_len);
+	const u32 len1 = strlen(s1);
+	const u32 len2 = strlen(s2);
+	u32 score;
+
+	/* not a real spamsum signature? */
+	if (len1 > SPAMSUM_LENGTH || len2 > SPAMSUM_LENGTH) return 0;
+
+	/* only strings sharing a substring of length ROLLING_WINDOW are
+	   candidates at all */
+	if (!has_common_substring(s1, s2)) return 0;
+
+	/* the edit distance gives a pretty good idea of how closely
+	   related the two strings are */
+	score = edit_distn(s1, len1, s2, len2);
+
+	/* scale by the combined length, so that the score measures the
+	   proportion of the message that has changed rather than an
+	   absolute quantity. The result is roughly on a 0-64 scale, with
+	   0 being a good match and 64 a complete mismatch */
+	score = (score * SPAMSUM_LENGTH) / (len1 + len2);
+
+	/* rescale to a 0-100 scale (friendlier to humans) */
+	score = (100 * score) / 64;
+
+	/* values of 100 and above are possible here; they all mean a
+	   really terrible match */
+	if (score >= 100) return 0;
+
+	/* flip the scale: 0 is now a poor match, 100 an excellent one.
+	   The former cap of block_size/MIN_BLOCKSIZE * MIN(len1, len2),
+	   which prevented small block sizes from exaggerating the match,
+	   is intentionally not applied */
+	return 100 - score;
+}
+
+/*
+  given two spamsum strings return a value indicating the degree to which they match.
+
+  Both arguments must have the form "<blocksize>:<sig1>:<sig2>". The
+  result is on a 0-100 scale, 100 being the best match. 0 is returned
+  when a signature is malformed, when the block sizes are neither equal
+  nor a factor of two apart, or when memory for the working copies
+  cannot be allocated.
+*/
+u32 spamsum_match(const char *str1, const char *str2)
+{
+	u32 block_size1, block_size2;
+	u32 score = 0;
+	char *s1, *s2;
+	char *s1_1, *s1_2;
+	char *s2_1, *s2_2;
+
+	/* each spamsum is prefixed by its block size */
+	if (sscanf(str1, "%u:", &block_size1) != 1 ||
+	    sscanf(str2, "%u:", &block_size2) != 1) {
+		return 0;
+	}
+
+	/* if the blocksizes don't match then we are comparing
+	   apples to oranges ... */
+	if (block_size1 != block_size2 &&
+	    block_size1 != block_size2*2 &&
+	    block_size2 != block_size1*2) {
+		return 0;
+	}
+
+	/* move past the prefix */
+	str1 = strchr(str1, ':');
+	str2 = strchr(str2, ':');
+
+	if (!str1 || !str2) {
+		/* badly formed ... */
+		return 0;
+	}
+
+	/* there is very little information content is sequences of
+	   the same character like 'LLLLL'. Eliminate any sequences
+	   longer than 3. This is especially important when combined
+	   with the has_common_substring() test below. */
+	s1 = eliminate_sequences(str1+1);
+	s2 = eliminate_sequences(str2+1);
+
+	if (!s1 || !s2) {
+		/* out of memory. The return type is unsigned, so a
+		   negative error code would read as a huge score;
+		   report "no match" instead, and release whichever
+		   copy was allocated (free(NULL) is a no-op) */
+		free(s1);
+		free(s2);
+		return 0;
+	}
+
+	/* now break them into the two pieces */
+	s1_1 = s1;
+	s2_1 = s2;
+
+	s1_2 = strchr(s1, ':');
+	s2_2 = strchr(s2, ':');
+
+	if (!s1_2 || !s2_2) {
+		/* a signature is malformed - it doesn't have 2 parts */
+		free(s1); free(s2);
+		return 0;
+	}
+
+	*s1_2++ = 0;
+	*s2_2++ = 0;
+
+	/* each signature has a string for two block sizes. We now
+	   choose how to combine the two block sizes. We checked above
+	   that they have at least one block size in common */
+	if (block_size1 == block_size2) {
+		u32 score1, score2;
+		score1 = score_strings(s1_1, s2_1, block_size1);
+		score2 = score_strings(s1_2, s2_2, block_size2);
+		score = MAX(score1, score2);
+	} else if (block_size1 == block_size2*2) {
+		score = score_strings(s1_1, s2_2, block_size1);
+	} else {
+		score = score_strings(s1_2, s2_1, block_size2);
+	}
+
+	free(s1);
+	free(s2);
+
+	return score;
+}
+
+/*
+  return the maximum match for a file containing a list of spamsums
+
+  fname      text file holding one spamsum signature per line
+  sum        the signature to look up
+  threshold  the search stops at the first line scoring at least this much
+
+  Returns the best score found (0-100), or 0 if the file cannot be
+  opened.
+*/
+u32 spamsum_match_db(const char *fname, const char *sum, u32 threshold)
+{
+	FILE *f;
+	/* a signature is a block size of up to 10 digits, ':', up to
+	   SPAMSUM_LENGTH characters, ':' and up to SPAMSUM_LENGTH/2
+	   characters - well over 100 bytes once the newline and the
+	   terminator are added, so leave generous room */
+	char line[256];
+	u32 best = 0;
+
+	f = fopen(fname, "r");
+	if (!f) return 0;
+
+	/* on each line of the database we compute the spamsum match
+	   score. We then pick the best score */
+	while (fgets(line, sizeof(line), f)) {
+		u32 score;
+
+		/* cut the line at the newline, if there is one. Unlike
+		   indexing with strlen()-1 this is safe for an empty
+		   string */
+		line[strcspn(line, "\n")] = 0;
+
+		score = spamsum_match(sum, line);
+
+		if (score > best) {
+			best = score;
+			if (best >= threshold) break;
+		}
+	}
+
+	fclose(f);
+
+	return best;
+}
+
+/*
+  return the spamsum on stdin
+
+  Reads file descriptor 0 until end of file (or a read error) and hashes
+  everything that was read. Returns the malloc()ed signature from
+  spamsum(), or NULL if memory runs out.
+*/
+static char *spamsum_stdin(u32 flags, u32 block_size)
+{
+	uchar buf[10*1024];
+	uchar *msg;
+	size_t length = 0;
+	ssize_t n;
+	char *sum;
+
+	msg = malloc(sizeof(buf));
+	if (!msg) return NULL;
+	/* the message is kept NUL terminated at all times, including when
+	   the input turns out to be empty */
+	msg[0] = 0;
+
+	/* load the file, expanding the allocation as needed. */
+	while (1) {
+		uchar *grown;
+
+		n = read(0, buf, sizeof(buf));
+		if (n == -1 && errno == EINTR) continue;
+		if (n <= 0) break;
+
+		/* keep the old pointer until realloc() has succeeded,
+		   otherwise the data read so far would be leaked */
+		grown = realloc(msg, length + n + 1);
+		if (!grown) {
+			free(msg);
+			return NULL;
+		}
+		msg = grown;
+
+		memcpy(msg+length, buf, n);
+		length += n;
+		msg[length] = 0;
+	}
+
+	sum = spamsum(msg, length, flags, block_size);
+
+	free(msg);
+
+	return sum;
+}
+
+
+/*
+  return the spamsum on a file
+
+  fname is the file to hash; "-" means standard input. The file is
+  memory mapped read-only for the duration of the call.
+
+  Returns the malloc()ed signature from spamsum() (the caller frees it),
+  or NULL after printing a message with perror() if the file cannot be
+  opened, examined or mapped.
+*/
+char *spamsum_file(const char *fname, u32 flags, u32 block_size)
+{
+	int fd;
+	char *sum;
+	struct stat st;
+	uchar *msg;
+
+	if (strcmp(fname, "-") == 0) {
+		return spamsum_stdin(flags, block_size);
+	}
+
+	fd = open(fname, O_RDONLY);
+	if (fd == -1) {
+		perror(fname);
+		return NULL;
+	}
+
+	if (fstat(fd, &st) == -1) {
+		perror("fstat");
+		close(fd);
+		return NULL;
+	}
+
+	if (st.st_size == 0) {
+		/* a zero length mapping is an error for mmap(), so hash
+		   an empty message directly instead */
+		close(fd);
+		return spamsum((const uchar *)"", 0, flags, block_size);
+	}
+
+	msg = mmap(NULL, st.st_size, PROT_READ, MAP_FILE|MAP_PRIVATE, fd, 0);
+	if (msg == MAP_FAILED) {
+		perror("mmap");
+		close(fd);
+		return NULL;
+	}
+	/* the mapping stays valid after the descriptor is closed */
+	close(fd);
+
+	sum = spamsum(msg, st.st_size, flags, block_size);
+
+	munmap(msg, st.st_size);
+
+	return sum;
+}
+
+/*
+  print the usage message (the three calling forms, the exit codes used
+  when checking, and the options) to standard output. The text is a
+  single string literal continued with backslash-newline, so every
+  output line carries an explicit \n.
+*/
+static void show_help(void)
+{
+ printf("\n\
+spamsum v1.1 written by Andrew Tridgell <tridge@samba.org>\n\
+\n\
+spamsum computes a signature string that is particular good for detecting if two emails\n\
+are very similar. This can be used to detect SPAM.\n\
+\n\
+Syntax:\n\
+   spamsum [options] <files>\n\
+or\n\
+   spamsum [options] -d sigs.txt -c SIG\n\
+or\n\
+   spamsum [options] -d sigs.txt -C file\n\
+\n\
+When called with a list of filenames spamsum will write out the\n\
+signatures of each file on a separate line. You can specify the\n\
+filename '-' for standard input.\n\
+\n\
+When called with the second form, spamsum will print the best score\n\
+for the given signature with the signatures in the given database. A\n\
+score of 100 means a perfect match, and a score of 0 means a complete\n\
+mismatch.\n\
+\n\
+When checking, spamsum returns 0 (success) when the message *is* spam,\n\
+1 for internal errors, and 2 for messages whose signature is not\n\
+found.\n\
+\n\
+The 3rd form is just like the second form, but you pass a file\n\
+containing a message instead of a pre-computed signature.\n\
+\n\
+Options:\n\
+   -W              ignore whitespace\n\
+   -H              skip past mail headers\n\
+   -B <bsize>      force a block size of bsize\n\
+   -T <threshold>  set the threshold above which spamsum will stop\n\
+                   looking (default 90)\n\
+");
+}
+
+/*
+  command line entry point.
+
+  Options are processed in the order given, so -d, -W, -H, -B and -T only
+  affect a -c or -C that comes after them. -c and -C print the best score
+  against the database and exit with 0 when the score reaches the
+  threshold, 2 when it does not and 1 on errors. Without -c/-C the
+  signature of every file named on the command line is printed; the exit
+  status is then 1 if any of the files could not be hashed, else 0.
+*/
+int main(int argc, char *argv[])
+{
+ char *sum;
+ extern char *optarg;
+ extern int optind;
+ int c;
+ char *dbname = NULL;
+ u32 score;
+ int i;
+ int status = 0;
+ u32 flags = 0;
+ u32 block_size = 0;
+ u32 threshold = 90;
+
+ while ((c = getopt(argc, argv, "B:WHd:c:C:hT:")) != -1) {
+     switch (c) {
+     case 'W':
+         flags |= FLAG_IGNORE_WHITESPACE;
+         break;
+
+     case 'H':
+         flags |= FLAG_IGNORE_HEADERS;
+         break;
+
+     case 'd':
+         dbname = optarg;
+         break;
+
+     case 'B':
+         block_size = atoi(optarg);
+         break;
+
+     case 'T':
+         threshold = atoi(optarg);
+         break;
+
+     case 'c':
+         if (!dbname) {
+             show_help();
+             exit(1);
+         }
+         score = spamsum_match_db(dbname, optarg,
+                      threshold);
+         printf("%u\n", score);
+         exit(score >= threshold ? 0 : 2);
+
+     case 'C':
+         if (!dbname) {
+             show_help();
+             exit(1);
+         }
+         /* spamsum_file() returns NULL on failure; that must not
+            be handed on as a signature string */
+         sum = spamsum_file(optarg, flags, block_size);
+         if (!sum) {
+             exit(1);
+         }
+         score = spamsum_match_db(dbname, sum, threshold);
+         free(sum);
+         printf("%u\n", score);
+         exit(score >= threshold ? 0 : 2);
+
+     case 'h':
+     default:
+         show_help();
+         exit(0);
+     }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (argc == 0) {
+     show_help();
+     return 0;
+ }
+
+ /* compute the spamsum on a list of files */
+ for (i=0;i<argc;i++) {
+     sum = spamsum_file(argv[i], flags, block_size);
+     if (!sum) {
+         /* passing NULL to printf("%s") is undefined; report the
+            failure and carry on with the remaining files */
+         fprintf(stderr, "%s: unable to compute spamsum\n", argv[i]);
+         status = 1;
+         continue;
+     }
+     printf("%s\n", sum);
+     free(sum);
+ }
+
+ return status;
+}