Initial code checkin
This commit is contained in:
commit
9efeb7d8f3
5 changed files with 1163 additions and 0 deletions
65
README
Normal file
65
README
Normal file
|
|
@ -0,0 +1,65 @@
|
|||
PySpamSum v1.0
|
||||
==============
|
||||
|
||||
spamsum is a fuzzy hash specifically designed for hashing email messages
|
||||
to detect if they are SPAM. The spamsum utility includes the ability to
|
||||
generate the spamsum hash and check a new message against an existing set
|
||||
of hashes to find a match.
|
||||
|
||||
pyspamsum is a Python wrapper for the core API of spamsum.
|
||||
|
||||
The original spamsum code has been licensed under the terms of the
|
||||
Perl Artistic License. It has been slightly modified; the changes are listed in the header comment of spamsum.c.
|
||||
|
||||
The original code is Copyright Andrew Tridgell <tridge@samba.org> 2002.
|
||||
It forms part of Andrew's junkcode, and is available here:
|
||||
|
||||
http://www.samba.org/junkcode/#spamsum
|
||||
|
||||
The spamsum code in this project is derived from an updated version that
|
||||
was published at Linux.conf.au 2004:
|
||||
|
||||
http://linux.anu.edu.au/linux.conf.au/2004/papers/junkcode/spamsum
|
||||
|
||||
For details on spamsum itself, please see the spamsum README:
|
||||
|
||||
http://samba.org/ftp/unpacked/junkcode/spamsum/README
|
||||
|
||||
This Python wrapper is released under the new BSD license, and is
|
||||
Copyright Russell Keith-Magee <russell@keith-magee.com> 2009.
|
||||
|
||||
Installation
|
||||
------------
|
||||
|
||||
At a prompt, run:
|
||||
|
||||
$ python setup.py install
|
||||
|
||||
Usage
|
||||
-----
|
||||
|
||||
# Import spamsum and set up some strings
|
||||
>>> import spamsum
|
||||
>>> s1 = "I am the very model of a modern Major-General, I've information animal and vegegtable and mineral"
|
||||
>>> s2 = "I am the very model of a modern Brigadier, I've information animal and vegetable and something else"
|
||||
>>> s3 = "Huh? Gilbert and Who?"
|
||||
|
||||
# Evaluate the edit distance between two strings
|
||||
>>> spamsum.edit_distance(s1, s2)
|
||||
28
|
||||
|
||||
# Evaluate the spamsum of some strings
|
||||
>>> sum1 = spamsum.spamsum(s1)
|
||||
>>> sum2 = spamsum.spamsum(s2)
|
||||
>>> sum3 = spamsum.spamsum(s3)
|
||||
>>> print sum1
|
||||
3:kEvyc/sFIKwYclQY4MKLFE4IgunfELzIKygn:kE6Ai3KQ/MKOgWf/KZn
|
||||
|
||||
# Compare two spamsums. 0 = no match, 100 = perfect match.
|
||||
>>> spamsum.match(sum1, sum1)
|
||||
100
|
||||
>>> spamsum.match(sum1, sum2)
|
||||
66
|
||||
>>> spamsum.match(sum1, sum3)
|
||||
0
|
||||
|
||||
269
edit_dist.c
Normal file
269
edit_dist.c
Normal file
|
|
@ -0,0 +1,269 @@
|
|||
/*
|
||||
This edit distance code is taken from trn3.6. A few minor
|
||||
modifications have been made by Andrew Tridgell <tridge@samba.org>
|
||||
for use in spamsum.
|
||||
*/
|
||||
|
||||
|
||||
/***************************************************************************/
|
||||
|
||||
|
||||
/* The authors make no claims as to the fitness or correctness of this software
|
||||
* for any use whatsoever, and it is provided as is. Any use of this software
|
||||
* is at the user's own risk.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
/* edit_dist -- returns the minimum edit distance between two strings
|
||||
|
||||
Program by: Mark Maimone CMU Computer Science 13 Nov 89
|
||||
Last Modified: 28 Jan 90
|
||||
|
||||
If the input strings have length n and m, the algorithm runs in time
|
||||
O(nm) and space O(min(m,n)).
|
||||
|
||||
HISTORY
|
||||
13 Nov 89 (mwm) Created edit_dist() and set_costs().
|
||||
|
||||
28 Jan 90 (mwm) Added view_costs(). Should verify that THRESHOLD
|
||||
computations will work even when THRESHOLD is not a multiple of
|
||||
sizeof(int).
|
||||
|
||||
17 May 93 (mwm) Improved performance when used with trn's newsgroup
|
||||
processing; assume all costs are 1, and you can terminate when a
|
||||
threshold is exceeded.
|
||||
*/
|
||||
|
||||
#define MIN_DIST 100

#define TRN_SPEEDUP		/* Use a less-general version of the
				   routine, one that's better for trn.
				   Edit costs are compile-time constants,
				   and it's okay to terminate if the edit
				   distance is known to exceed MIN_DIST */

#define THRESHOLD 4000		/* worry about allocating more memory only
				   when this # of bytes is exceeded */
#define STRLENTHRESHOLD ((int) ((THRESHOLD / sizeof (int) - 3) / 2))

#define SAFE_ASSIGN(x,y) (((x) != NULL) ? (*(x) = (y)) : (y))

#define swap_int(x,y)  (_iswap = (x), (x) = (y), (y) = _iswap)
#define swap_char(x,y) (_cswap = (x), (x) = (y), (y) = _cswap)
#define min3(x,y,z) (_mx = (x), _my = (y), _mz = (z), (_mx < _my ? (_mx < _mz ? _mx : _mz) : (_mz < _my) ? _mz : _my))
#define min2(x,y) (_mx = (x), _my = (y), (_mx < _my ? _mx : _my))


static int insert_cost = 1;
static int delete_cost = 1;
#ifndef TRN_SPEEDUP
static int change_cost = 1;
static int swap_cost = 1;
#endif

static int _iswap;		/* swap_int temp variable */
static const char *_cswap;	/* swap_char temp variable */
static int _mx, _my, _mz;	/* min2, min3 temp variables */

/* With TRN_SPEEDUP the per-operation costs are fixed at compile time.  A
   change (3) costs more than a delete plus an insert (2), and a swap (5)
   more than two of each, so the result is effectively the insert/delete
   distance between the two strings. */
#ifdef TRN_SPEEDUP
#define ins 1
#define del 1
#define ch 3
#define swap_cost 5
#endif

/* Accessors for the circular row buffer.  They rely on the local variables
   buffer, index, radix and from_len of edit_distn(), and on
   radix == 2 * from_len + 3 for the from_len actually being iterated. */
#define ar(x,y,index) (((x) == 0) ? (y) * del : (((y) == 0) ? (x) * ins : \
	buffer[mod(index)]))
#define NW(x,y)	  ar(x, y, index + from_len + 2)
#define N(x,y)	  ar(x, y, index + from_len + 3)
#define W(x,y)	  ar(x, y, index + radix - 1)
#define NNWW(x,y) ar(x, y, index + 1)
#define mod(x) ((x) % radix)

/* edit_distn -- returns the edit distance between two strings, or -1 on
   failure (the row buffer could not be allocated).

   from, from_len   first string and its length in bytes
   to, to_len       second string and its length in bytes

   A NULL pointer or a non-positive length is treated as the empty string.
   The strings need not be NUL terminated; only the given lengths are read.

   The distance is found by dynamic programming over a matrix with one row
   per character of `to` and one column per character of `from`:

	ar(x,0) := x * insert_cost
	ar(0,y) := y * delete_cost
	ar(x,y) := min(a(x - 1, y - 1) + (from[x] == to[y] ? 0 : change),
		       a(x - 1, y) + insert_cost,
		       a(x, y - 1) + delete_cost,
		       a(x - 2, y - 2) + (from[x] == to[y-1] &&
					  from[x-1] == to[y] ? swap_cost :
					  infinity))

   Only the two preceding rows are ever consulted, so they are kept in a
   circular buffer of 2 * from_len + 3 ints (the zero row and column are
   implicit; the extra element lets the swap check read the old value
   before it is overwritten).

   With TRN_SPEEDUP, the row loop stops as soon as every cell of a row
   exceeds MIN_DIST; the value returned is then the last cell of that row
   rather than the exact distance.

   Small inputs use a static buffer, so the function is not reentrant. */

int
edit_distn(const char *from, int from_len, const char *to, int to_len)
{
#ifndef TRN_SPEEDUP
    int ins, del, ch;		/* local copies of edit costs */
#endif
    int row, col, index;	/* dynamic programming counters */
    int radix;			/* radix for modular indexing */
#ifdef TRN_SPEEDUP
    int low;			/* smallest value seen in the current row */
#endif
    int result;
    int *buffer;		/* storage for the circular row buffer */
    static int store[THRESHOLD / sizeof (int)];
				/* a small amount of static storage, used
				   when the input strings are small enough */

/* Handle trivial cases when one string is empty */

    if (from == NULL || from_len <= 0) {
	if (to == NULL || to_len <= 0)
	    return 0;
	return to_len * insert_cost;
    }
    if (to == NULL || to_len <= 0)
	return from_len * delete_cost;

#ifndef TRN_SPEEDUP
    ins = insert_cost;
    del = delete_cost;
    ch = change_cost;
#endif

/* Make from short enough to fit in the static storage, if it's at all
   possible */

    if (from_len > to_len && from_len > STRLENTHRESHOLD) {
	swap_int(from_len, to_len);
	swap_char(from, to);
#ifndef TRN_SPEEDUP
	swap_int(ins, del);
#endif
    }

/* The radix must describe the string that ends up as `from`, so it is
   computed only after the possible swap above.  A radix taken from the
   pre-swap length breaks the N/NW/W offsets and can index past `store`. */

    radix = 2 * from_len + 3;

/* Allocate the array storage (from the heap if necessary) */

    if (from_len <= STRLENTHRESHOLD) {
	buffer = store;
    } else {
	buffer = malloc((size_t) radix * sizeof *buffer);
	if (buffer == NULL)
	    return -1;
    }

    index = 0;

#ifdef DEBUG_EDITDIST
    printf(" ");
    for (col = 0; col < from_len; col++)
	printf(" %c ", from[col]);
    printf("\n ");

    for (col = 0; col <= from_len; col++)
	printf("%2d ", col * del);
#endif

/* Row 0 is handled implicitly; its value at a given column is col*del.
   The loop below computes the values for Row 1.  At this point we know the
   strings are nonempty.  We also don't need to consider swap costs in row
   1.

   NOTE: the indices row and col below point into the STRING, so the
   corresponding MATRIX indices are row+1 and col+1. */

    buffer[index] = min2(ins + del, (from[0] == to[0] ? 0 : ch));
    index++;
#ifdef TRN_SPEEDUP
    low = buffer[mod(index + radix - 1)];
#endif

#ifdef DEBUG_EDITDIST
    printf("\n %c %2d %2d ", to[0], ins, buffer[index - 1]);
#endif

    for (col = 1; col < from_len; col++) {
	buffer[index] = min3(
		col * del + ((from[col] == to[0]) ? 0 : ch),
		(col + 1) * del + ins,
		buffer[index - 1] + del);
#ifdef TRN_SPEEDUP
	if (buffer[index] < low)
	    low = buffer[index];
#endif
	index++;

#ifdef DEBUG_EDITDIST
	printf("%2d ", buffer[index - 1]);
#endif
    }

#ifdef DEBUG_EDITDIST
    printf("\n %c %2d ", to[1], 2 * ins);
#endif

/* Now handle the rest of the matrix */

    for (row = 1; row < to_len; row++) {
	for (col = 0; col < from_len; col++) {
	    buffer[index] = min3(
		    NW(row, col) + ((from[col] == to[row]) ? 0 : ch),
		    N(row, col + 1) + ins,
		    W(row + 1, col) + del);
	    /* adjacent transposition */
	    if (from[col] == to[row - 1] && col > 0 &&
		    from[col - 1] == to[row])
		buffer[index] = min2(buffer[index],
			NNWW(row - 1, col - 1) + swap_cost);

#ifdef DEBUG_EDITDIST
	    printf("%2d ", buffer[index]);
#endif
#ifdef TRN_SPEEDUP
	    /* col == 0 restarts the per-row minimum */
	    if (buffer[index] < low || col == 0)
		low = buffer[index];
#endif

	    index = mod(index + 1);
	}
#ifdef DEBUG_EDITDIST
	if (row < to_len - 1)
	    printf("\n %c %2d ", to[row+1], (row + 2) * ins);
	else
	    printf("\n");
#endif
#ifdef TRN_SPEEDUP
	if (low > MIN_DIST)
	    break;
#endif
    }

    /* the most recently written cell holds the answer */
    result = buffer[mod(index + radix - 1)];
    if (buffer != store)
	free(buffer);
    return result;
}

/* keep the helper macros from leaking into anything that follows */
#undef ar
#undef NW
#undef N
#undef W
#undef NNWW
#undef mod
#ifdef TRN_SPEEDUP
#undef ins
#undef del
#undef ch
#undef swap_cost
#endif
|
||||
|
||||
95
pyspamsum.c
Normal file
95
pyspamsum.c
Normal file
|
|
@ -0,0 +1,95 @@
|
|||
/**
|
||||
* PySpamsum v1.0.0
|
||||
*
|
||||
* A Python wrapper around the spamsum library written by
|
||||
* Andrew Tridgell.
|
||||
*
|
||||
* Copyright 2009 Russell Keith-Magee <russell@keith-magee.com>
|
||||
*/
|
||||
#include <Python.h>
|
||||
#include <sys/types.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int edit_distn(char *from, int from_len, char *to, int to_len);
|
||||
char *spamsum(const unsigned char *in, size_t length, unsigned int flags, unsigned int bsize);
|
||||
unsigned int spamsum_match(const char *str1, const char *str2);
|
||||
|
||||
/*
 * spamsum.edit_distance(from, to) -> int
 *
 * Parses two string/buffer arguments together with their lengths and
 * returns whatever edit_distn() reports for them as a Python integer.
 * Returns NULL (with the exception set by PyArg_ParseTuple) when the
 * arguments do not match.
 */
PyObject *py_edit_distance(PyObject *self, PyObject *args)
{
    char *src = NULL;
    char *dst = NULL;
    int src_len = 0;
    int dst_len = 0;

    if (PyArg_ParseTuple(args, "s#s#", &src, &src_len, &dst, &dst_len) == 0)
        return NULL;

    return Py_BuildValue("i", edit_distn(src, src_len, dst, dst_len));
}
|
||||
|
||||
|
||||
/*
 * spamsum.spamsum(data[, flags[, bsize]]) -> str
 *
 * Computes the spamsum signature of `data`.  `flags` and `bsize` are
 * optional and default to 0; they are handed to spamsum() unchanged.
 *
 * Returns a new Python string, or NULL with an exception set when the
 * arguments are invalid or the signature could not be allocated.
 */
PyObject *py_spamsum(PyObject *self, PyObject *args)
{
    PyObject *result;
    char *sum;

    unsigned char *in = NULL;
    /* This file does not define PY_SSIZE_T_CLEAN, so "s#" stores the length
       in an int.  Receiving it in a size_t would leave the upper bytes
       uninitialised on LP64 platforms. */
    int length = 0;
    /* "i" stores into an int; convert explicitly afterwards */
    int flags = 0;
    int bsize = 0;

    if (!PyArg_ParseTuple(args, "s#|ii", &in, &length, &flags, &bsize))
    {
        return NULL;
    }

    sum = spamsum(in, (size_t) length, (unsigned int) flags, (unsigned int) bsize);
    if (sum == NULL)
    {
        return PyErr_NoMemory();
    }

    /* Py_BuildValue("s") copies the C string, so the malloc'd signature
       must be released here. */
    result = Py_BuildValue("s", sum);
    free(sum);

    return result;
}
|
||||
|
||||
/*
 * spamsum.match(sig1, sig2) -> int
 *
 * Takes two spamsum signature strings and returns the value computed by
 * spamsum_match() for them.  Returns NULL (with the exception set by
 * PyArg_ParseTuple) when the arguments are not two strings.
 */
PyObject *py_match(PyObject *self, PyObject *args)
{
    char *first = NULL;
    char *second = NULL;
    unsigned int degree;

    if (PyArg_ParseTuple(args, "ss", &first, &second) == 0)
        return NULL;

    degree = spamsum_match(first, second);
    return Py_BuildValue("i", degree);
}
|
||||
|
||||
|
||||
static PyMethodDef methods[] = {
|
||||
{"edit_distance", py_edit_distance, METH_VARARGS,
|
||||
"Calculate the edit distance between two strings."
|
||||
},
|
||||
{"spamsum", py_spamsum, METH_VARARGS,
|
||||
"Calculate the spamsum of a string."
|
||||
},
|
||||
{"match", py_match, METH_VARARGS,
|
||||
"Given two spamsum strings return a value indicating the degree to which they match."
|
||||
},
|
||||
{NULL, NULL, 0, NULL}
|
||||
};
|
||||
|
||||
PyMODINIT_FUNC
|
||||
initspamsum()
|
||||
{
|
||||
(void) Py_InitModule("spamsum", methods);
|
||||
}
|
||||
55
setup.py
Normal file
55
setup.py
Normal file
|
|
@ -0,0 +1,55 @@
|
|||
from distutils.core import setup, Extension

# Build script for the "spamsum" extension module.
#
# The extension is compiled from three C sources: the Python wrapper
# (pyspamsum.c), the spamsum implementation (spamsum.c) and the edit
# distance routine it relies on (edit_dist.c).
#
# Note: `classifiers` must be passed exactly once.  Repeating a keyword
# argument in a call is a SyntaxError, which made the script unrunnable.
setup(name = "spamsum",
    version = "1.0.0",
    author = "Russell Keith-Magee",
    author_email = "russell@keith-magee.com",
    url = 'http://code.google.com/p/pyspamsum/',
    license = "New BSD",
    classifiers = [
        'Development Status :: 5 - Production/Stable',
        'License :: OSI Approved :: BSD License',
        'Operating System :: OS Independent',
        'Topic :: Text Processing',
        'Topic :: Utilities',
    ],
    platforms = ["any"],
    description = "A Python wrapper for Andrew Tridgell's spamsum algorithm",
    long_description = """
spamsum is a fuzzy hash specifically designed for hashing email messages
to detect if they are SPAM. The spamsum utility includes the ability to
generate the spamsum hash and check a new message against a existing set
of hashes to find a match.

pyspamsum is a Python wrapper for the core API of spamsum.

The original spamsum code has been licensed under the terms of the
the Perl Artistic License. It has been slightly modified

The original code is Copyright Andrew Tridgell <tridge@samba.org> 2002.
It forms part of Andrew's junkcode, and is available here:

http://www.samba.org/junkcode/#spamsum

The spamsum code in this project is derived from an updated version that
was published at Linux.conf.au 2004:

http://linux.anu.edu.au/linux.conf.au/2004/papers/junkcode/spamsum

For details on spamsum itself, please see the spamsum README:

http://samba.org/ftp/unpacked/junkcode/spamsum/README
""",
    ext_modules = [
        Extension(
            "spamsum", [
                "pyspamsum.c",
                "spamsum.c",
                "edit_dist.c",
            ]
        )
    ]
)
|
||||
679
spamsum.c
Normal file
679
spamsum.c
Normal file
|
|
@ -0,0 +1,679 @@
|
|||
/*
|
||||
this is a checksum routine that is specifically designed for spam.
|
||||
Copyright Andrew Tridgell <tridge@samba.org> 2002
|
||||
|
||||
This code is released under the GNU General Public License version 2
|
||||
or later. Alteratively, you may also use this code under the terms
|
||||
of the Perl Artistic license.
|
||||
|
||||
If you wish to distribute this code under the terms of a different
|
||||
free software license then please ask me. If there is a good reason
|
||||
then I will probably say yes.
|
||||
|
||||
---
|
||||
|
||||
Modified by Russell Keith-Magee, 20 Jan 2009:
|
||||
* removed the condition preventing comparison of small block sizes
|
||||
(lines 364-366)
|
||||
* Modified the help string to be legal cross platform C.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/stat.h>
|
||||
#include <unistd.h>
|
||||
#include <ctype.h>
|
||||
|
||||
/* the output is a string of length 64 in base64 */
|
||||
#define SPAMSUM_LENGTH 64
|
||||
|
||||
#define MIN_BLOCKSIZE 3
|
||||
#define HASH_PRIME 0x01000193
|
||||
#define HASH_INIT 0x28021967
|
||||
|
||||
#define ROLLING_WINDOW 7
|
||||
|
||||
#ifndef MIN
|
||||
#define MIN(a,b) ((a)<(b)?(a):(b))
|
||||
#endif
|
||||
|
||||
#ifndef MAX
|
||||
#define MAX(a,b) ((a)>(b)?(a):(b))
|
||||
#endif
|
||||
|
||||
typedef unsigned u32;
|
||||
typedef unsigned char uchar;
|
||||
|
||||
#define FLAG_IGNORE_WHITESPACE 1
|
||||
#define FLAG_IGNORE_HEADERS 2
|
||||
|
||||
static struct {
|
||||
uchar window[ROLLING_WINDOW];
|
||||
u32 h1, h2, h3;
|
||||
u32 n;
|
||||
} roll_state;
|
||||
|
||||
/*
|
||||
a rolling hash, based on the Adler checksum. By using a rolling hash
|
||||
we can perform auto resynchronisation after inserts/deletes
|
||||
|
||||
internally, h1 is the sum of the bytes in the window and h2
|
||||
is the sum of the bytes times the index
|
||||
|
||||
h3 is a shift/xor based rolling hash, and is mostly needed to ensure that
|
||||
we can cope with large blocksize values
|
||||
*/
|
||||
static inline u32 roll_hash(uchar c)
|
||||
{
|
||||
roll_state.h2 -= roll_state.h1;
|
||||
roll_state.h2 += ROLLING_WINDOW * c;
|
||||
|
||||
roll_state.h1 += c;
|
||||
roll_state.h1 -= roll_state.window[roll_state.n % ROLLING_WINDOW];
|
||||
|
||||
roll_state.window[roll_state.n % ROLLING_WINDOW] = c;
|
||||
roll_state.n++;
|
||||
|
||||
roll_state.h3 = (roll_state.h3 << 5) & 0xFFFFFFFF;
|
||||
roll_state.h3 ^= c;
|
||||
|
||||
return roll_state.h1 + roll_state.h2 + roll_state.h3;
|
||||
}
|
||||
|
||||
/*
|
||||
reset the state of the rolling hash and return the initial rolling hash value
|
||||
*/
|
||||
static u32 roll_reset(void)
|
||||
{
|
||||
memset(&roll_state, 0, sizeof(roll_state));
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* a simple non-rolling hash, based on the FNV hash */
|
||||
static inline u32 sum_hash(uchar c, u32 h)
|
||||
{
|
||||
h *= HASH_PRIME;
|
||||
h ^= c;
|
||||
return h;
|
||||
}
|
||||
|
||||
/*
|
||||
take a message of length 'length' and return a string representing a hash of that message,
|
||||
prefixed by the selected blocksize
|
||||
*/
|
||||
char *spamsum(const uchar *in, size_t length, u32 flags, u32 bsize)
|
||||
{
|
||||
const char *b64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
|
||||
char *ret, *p;
|
||||
u32 total_chars;
|
||||
u32 h, h2, h3;
|
||||
u32 j, n, i, k;
|
||||
u32 block_size;
|
||||
uchar ret2[SPAMSUM_LENGTH/2 + 1];
|
||||
|
||||
/* if we are ignoring email headers then skip past them now */
|
||||
if (flags & FLAG_IGNORE_HEADERS) {
|
||||
const uchar *s = strstr(in, "\n\n");
|
||||
if (s) {
|
||||
length -= (s+2 - in);
|
||||
in = s+2;
|
||||
}
|
||||
}
|
||||
|
||||
if (flags & FLAG_IGNORE_WHITESPACE) {
|
||||
/* count the non-ignored chars */
|
||||
for (n=0, i=0; i<length; i++) {
|
||||
if (isspace(in[i])) continue;
|
||||
n++;
|
||||
}
|
||||
total_chars = n;
|
||||
} else {
|
||||
total_chars = length;
|
||||
}
|
||||
|
||||
if (bsize == 0) {
|
||||
/* guess a reasonable block size */
|
||||
block_size = MIN_BLOCKSIZE;
|
||||
while (block_size * SPAMSUM_LENGTH < total_chars) {
|
||||
block_size = block_size * 2;
|
||||
}
|
||||
} else {
|
||||
block_size = bsize;
|
||||
}
|
||||
|
||||
ret = malloc(SPAMSUM_LENGTH + SPAMSUM_LENGTH/2 + 20);
|
||||
if (!ret) return NULL;
|
||||
|
||||
again:
|
||||
/* the first part of the spamsum signature is the blocksize */
|
||||
snprintf(ret, 12, "%u:", block_size);
|
||||
p = ret + strlen(ret);
|
||||
|
||||
memset(p, 0, SPAMSUM_LENGTH+1);
|
||||
memset(ret2, 0, sizeof(ret2));
|
||||
|
||||
k = j = 0;
|
||||
h3 = h2 = HASH_INIT;
|
||||
h = roll_reset();
|
||||
|
||||
for (i=0; i<length; i++) {
|
||||
if ((flags & FLAG_IGNORE_WHITESPACE) &&
|
||||
isspace(in[i])) continue;
|
||||
|
||||
/*
|
||||
at each character we update the rolling hash and
|
||||
the normal hash. When the rolling hash hits the
|
||||
reset value then we emit the normal hash as a
|
||||
element of the signature and reset both hashes
|
||||
*/
|
||||
h = roll_hash(in[i]);
|
||||
h2 = sum_hash(in[i], h2);
|
||||
h3 = sum_hash(in[i], h3);
|
||||
|
||||
if (h % block_size == (block_size-1)) {
|
||||
/* we have hit a reset point. We now emit a
|
||||
hash which is based on all chacaters in the
|
||||
piece of the message between the last reset
|
||||
point and this one */
|
||||
p[j] = b64[h2 % 64];
|
||||
if (j < SPAMSUM_LENGTH-1) {
|
||||
/* we can have a problem with the tail
|
||||
overflowing. The easiest way to
|
||||
cope with this is to only reset the
|
||||
second hash if we have room for
|
||||
more characters in our
|
||||
signature. This has the effect of
|
||||
combining the last few pieces of
|
||||
the message into a single piece */
|
||||
h2 = HASH_INIT;
|
||||
j++;
|
||||
}
|
||||
}
|
||||
|
||||
/* this produces a second signature with a block size
|
||||
of block_size*2. By producing dual signatures in
|
||||
this way the effect of small changes in the message
|
||||
size near a block size boundary is greatly reduced. */
|
||||
if (h % (block_size*2) == ((block_size*2)-1)) {
|
||||
ret2[k] = b64[h3 % 64];
|
||||
if (k < SPAMSUM_LENGTH/2-1) {
|
||||
h3 = HASH_INIT;
|
||||
k++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* if we have anything left then add it to the end. This
|
||||
ensures that the last part of the message is always
|
||||
considered */
|
||||
if (h != 0) {
|
||||
p[j] = b64[h2 % 64];
|
||||
ret2[k] = b64[h3 % 64];
|
||||
}
|
||||
|
||||
strcat(p+j, ":");
|
||||
strcat(p+j, ret2);
|
||||
|
||||
/* our blocksize guess may have been way off - repeat if necessary */
|
||||
if (bsize == 0 && block_size > MIN_BLOCKSIZE && j < SPAMSUM_LENGTH/2) {
|
||||
block_size = block_size / 2;
|
||||
goto again;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
we only accept a match if we have at least one common substring in
|
||||
the signature of length ROLLING_WINDOW. This dramatically drops the
|
||||
false positive rate for low score thresholds while having
|
||||
negligable affect on the rate of spam detection.
|
||||
|
||||
return 1 if the two strings do have a common substring, 0 otherwise
|
||||
*/
|
||||
static int has_common_substring(const char *s1, const char *s2)
|
||||
{
|
||||
int i, j;
|
||||
int num_hashes;
|
||||
u32 hashes[SPAMSUM_LENGTH];
|
||||
|
||||
/* there are many possible algorithms for common substring
|
||||
detection. In this case I am re-using the rolling hash code
|
||||
to act as a filter for possible substring matches */
|
||||
|
||||
roll_reset();
|
||||
memset(hashes, 0, sizeof(hashes));
|
||||
|
||||
/* first compute the windowed rolling hash at each offset in
|
||||
the first string */
|
||||
for (i=0;s1[i];i++) {
|
||||
hashes[i] = roll_hash((uchar)s1[i]);
|
||||
}
|
||||
num_hashes = i;
|
||||
|
||||
roll_reset();
|
||||
|
||||
/* now for each offset in the second string compute the
|
||||
rolling hash and compare it to all of the rolling hashes
|
||||
for the first string. If one matches then we have a
|
||||
candidate substring match. We then confirm that match with
|
||||
a direct string comparison */
|
||||
for (i=0;s2[i];i++) {
|
||||
u32 h = roll_hash((uchar)s2[i]);
|
||||
if (i < ROLLING_WINDOW-1) continue;
|
||||
for (j=ROLLING_WINDOW-1;j<num_hashes;j++) {
|
||||
if (hashes[j] != 0 && hashes[j] == h) {
|
||||
/* we have a potential match - confirm it */
|
||||
if (strlen(s2+i-(ROLLING_WINDOW-1)) >= ROLLING_WINDOW &&
|
||||
strncmp(s2+i-(ROLLING_WINDOW-1),
|
||||
s1+j-(ROLLING_WINDOW-1),
|
||||
ROLLING_WINDOW) == 0) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/*
  eliminate sequences of longer than 3 identical characters. These
  sequences contain very little information so they tend to just bias
  the result unfairly

  Returns a newly malloc()ed copy of str in which every run of one
  character is cut down to at most 3 repetitions; the caller frees it.
  Returns NULL if str is NULL or the allocation fails.

  Strings of 3 characters or fewer are copied unchanged.  They have to be
  handled separately because the filtering loop starts writing at offset
  3, which would lie beyond the end of a shorter copy.
*/
static char *eliminate_sequences(const char *str)
{
	char *ret;
	size_t i, j, len;

	if (!str) return NULL;

	len = strlen(str);

	ret = malloc(len + 1);
	if (!ret) return NULL;

	if (len <= 3) {
		memcpy(ret, str, len + 1);
		return ret;
	}

	/* the first three characters can never complete a run of four */
	memcpy(ret, str, 3);

	for (i = j = 3; i < len; i++) {
		/* keep the character unless it repeats the previous three */
		if (str[i] != str[i-1] ||
		    str[i] != str[i-2] ||
		    str[i] != str[i-3]) {
			ret[j++] = str[i];
		}
	}

	ret[j] = 0;

	return ret;
}
|
||||
|
||||
/*
|
||||
this is the low level string scoring algorithm. It takes two strings
|
||||
and scores them on a scale of 0-100 where 0 is a terrible match and
|
||||
100 is a great match. The block_size is used to cope with very small
|
||||
messages.
|
||||
*/
|
||||
static unsigned score_strings(const char *s1, const char *s2, u32 block_size)
|
||||
{
|
||||
u32 score;
|
||||
u32 len1, len2;
|
||||
int edit_distn(const char *from, int from_len, const char *to, int to_len);
|
||||
|
||||
len1 = strlen(s1);
|
||||
len2 = strlen(s2);
|
||||
|
||||
if (len1 > SPAMSUM_LENGTH || len2 > SPAMSUM_LENGTH) {
|
||||
/* not a real spamsum signature? */
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* the two strings must have a common substring of length
|
||||
ROLLING_WINDOW to be candidates */
|
||||
if (has_common_substring(s1, s2) == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* compute the edit distance between the two strings. The edit distance gives
|
||||
us a pretty good idea of how closely related the two strings are */
|
||||
score = edit_distn(s1, len1, s2, len2);
|
||||
|
||||
/* scale the edit distance by the lengths of the two
|
||||
strings. This changes the score to be a measure of the
|
||||
proportion of the message that has changed rather than an
|
||||
absolute quantity. It also copes with the variability of
|
||||
the string lengths. */
|
||||
score = (score * SPAMSUM_LENGTH) / (len1 + len2);
|
||||
|
||||
/* at this stage the score occurs roughly on a 0-64 scale,
|
||||
* with 0 being a good match and 64 being a complete
|
||||
* mismatch */
|
||||
|
||||
/* rescale to a 0-100 scale (friendlier to humans) */
|
||||
score = (100 * score) / 64;
|
||||
|
||||
/* it is possible to get a score above 100 here, but it is a
|
||||
really terrible match */
|
||||
if (score >= 100) return 0;
|
||||
|
||||
/* now re-scale on a 0-100 scale with 0 being a poor match and
|
||||
100 being a excellent match. */
|
||||
score = 100 - score;
|
||||
|
||||
/* when the blocksize is small we may not want to exaggerate the match size */
|
||||
// if (score > block_size/MIN_BLOCKSIZE * MIN(len1, len2)) {
|
||||
// score = block_size/MIN_BLOCKSIZE * MIN(len1, len2);
|
||||
// }
|
||||
|
||||
return score;
|
||||
}
|
||||
|
||||
/*
|
||||
given two spamsum strings return a value indicating the degree to which they match.
|
||||
*/
|
||||
u32 spamsum_match(const char *str1, const char *str2)
|
||||
{
|
||||
u32 block_size1, block_size2;
|
||||
u32 score = 0;
|
||||
char *s1, *s2;
|
||||
char *s1_1, *s1_2;
|
||||
char *s2_1, *s2_2;
|
||||
|
||||
/* each spamsum is prefixed by its block size */
|
||||
if (sscanf(str1, "%u:", &block_size1) != 1 ||
|
||||
sscanf(str2, "%u:", &block_size2) != 1) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* if the blocksizes don't match then we are comparing
|
||||
apples to oranges ... */
|
||||
if (block_size1 != block_size2 &&
|
||||
block_size1 != block_size2*2 &&
|
||||
block_size2 != block_size1*2) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* move past the prefix */
|
||||
str1 = strchr(str1, ':');
|
||||
str2 = strchr(str2, ':');
|
||||
|
||||
if (!str1 || !str2) {
|
||||
/* badly formed ... */
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* there is very little information content is sequences of
|
||||
the same character like 'LLLLL'. Eliminate any sequences
|
||||
longer than 3. This is especially important when combined
|
||||
with the has_common_substring() test below. */
|
||||
s1 = eliminate_sequences(str1+1);
|
||||
s2 = eliminate_sequences(str2+1);
|
||||
|
||||
if (!s1 || !s2) return -4;
|
||||
|
||||
/* now break them into the two pieces */
|
||||
s1_1 = s1;
|
||||
s2_1 = s2;
|
||||
|
||||
s1_2 = strchr(s1, ':');
|
||||
s2_2 = strchr(s2, ':');
|
||||
|
||||
if (!s1_2 || !s2_2) {
|
||||
/* a signature is malformed - it doesn't have 2 parts */
|
||||
free(s1); free(s2);
|
||||
return 0;
|
||||
}
|
||||
|
||||
*s1_2++ = 0;
|
||||
*s2_2++ = 0;
|
||||
|
||||
/* each signature has a string for two block sizes. We now
|
||||
choose how to combine the two block sizes. We checked above
|
||||
that they have at least one block size in common */
|
||||
if (block_size1 == block_size2) {
|
||||
u32 score1, score2;
|
||||
score1 = score_strings(s1_1, s2_1, block_size1);
|
||||
score2 = score_strings(s1_2, s2_2, block_size2);
|
||||
score = MAX(score1, score2);
|
||||
} else if (block_size1 == block_size2*2) {
|
||||
score = score_strings(s1_1, s2_2, block_size1);
|
||||
} else {
|
||||
score = score_strings(s1_2, s2_1, block_size2);
|
||||
}
|
||||
|
||||
free(s1);
|
||||
free(s2);
|
||||
|
||||
return score;
|
||||
}
|
||||
|
||||
/*
|
||||
return the maximum match for a file containing a list of spamsums
|
||||
*/
|
||||
u32 spamsum_match_db(const char *fname, const char *sum, u32 threshold)
|
||||
{
|
||||
FILE *f;
|
||||
char line[100];
|
||||
u32 best = 0;
|
||||
|
||||
f = fopen(fname, "r");
|
||||
if (!f) return 0;
|
||||
|
||||
/* on each line of the database we compute the spamsum match
|
||||
score. We then pick the best score */
|
||||
while (fgets(line, sizeof(line)-1, f)) {
|
||||
u32 score;
|
||||
int len;
|
||||
len = strlen(line);
|
||||
if (line[len-1] == '\n') line[len-1] = 0;
|
||||
|
||||
score = spamsum_match(sum, line);
|
||||
|
||||
if (score > best) {
|
||||
best = score;
|
||||
if (best >= threshold) break;
|
||||
}
|
||||
}
|
||||
|
||||
fclose(f);
|
||||
|
||||
return best;
|
||||
}
|
||||
|
||||
/*
|
||||
return the spamsum on stdin
|
||||
*/
|
||||
static char *spamsum_stdin(u32 flags, u32 block_size)
|
||||
{
|
||||
uchar buf[10*1024];
|
||||
uchar *msg;
|
||||
size_t length = 0;
|
||||
int n;
|
||||
char *sum;
|
||||
|
||||
msg = malloc(sizeof(buf));
|
||||
if (!msg) return NULL;
|
||||
|
||||
/* load the file, expanding the allocation as needed. */
|
||||
while (1) {
|
||||
n = read(0, buf, sizeof(buf));
|
||||
if (n == -1 && errno == EINTR) continue;
|
||||
if (n <= 0) break;
|
||||
|
||||
msg = realloc(msg, length + n);
|
||||
if (!msg) return NULL;
|
||||
|
||||
memcpy(msg+length, buf, n);
|
||||
length += n;
|
||||
}
|
||||
|
||||
sum = spamsum(msg, length, flags, block_size);
|
||||
|
||||
free(msg);
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
return the spamsum on a file
|
||||
*/
|
||||
char *spamsum_file(const char *fname, u32 flags, u32 block_size)
|
||||
{
|
||||
int fd;
|
||||
char *sum;
|
||||
struct stat st;
|
||||
uchar *msg;
|
||||
|
||||
if (strcmp(fname, "-") == 0) {
|
||||
return spamsum_stdin(flags, block_size);
|
||||
}
|
||||
|
||||
fd = open(fname, O_RDONLY);
|
||||
if (fd == -1) {
|
||||
perror(fname);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (fstat(fd, &st) == -1) {
|
||||
perror("fstat");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
msg = mmap(NULL, st.st_size, PROT_READ, MAP_FILE|MAP_PRIVATE, fd, 0);
|
||||
if (msg == (uchar *)-1) {
|
||||
perror("mmap");
|
||||
return NULL;
|
||||
}
|
||||
close(fd);
|
||||
|
||||
sum = spamsum(msg, st.st_size, flags, block_size);
|
||||
|
||||
munmap(msg, st.st_size);
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
/* Print the usage text to standard output.  The text is assembled from
   adjacent string literals, one per output line. */
static void show_help(void)
{
	static const char help_text[] =
		"\n"
		"spamsum v1.1 written by Andrew Tridgell <tridge@samba.org>\n"
		"\n"
		"spamsum computes a signature string that is particular good for detecting if two emails\n"
		"are very similar. This can be used to detect SPAM.\n"
		"\n"
		"Syntax:\n"
		"spamsum [options] <files>\n"
		"or\n"
		"spamsum [options] -d sigs.txt -c SIG\n"
		"or\n"
		"spamsum [options] -d sigs.txt -C file\n"
		"\n"
		"When called with a list of filenames spamsum will write out the\n"
		"signatures of each file on a separate line. You can specify the\n"
		"filename '-' for standard input.\n"
		"\n"
		"When called with the second form, spamsum will print the best score\n"
		"for the given signature with the signatures in the given database. A\n"
		"score of 100 means a perfect match, and a score of 0 means a complete\n"
		"mismatch.\n"
		"\n"
		"When checking, spamsum returns 0 (success) when the message *is* spam,\n"
		"1 for internal errors, and 2 for messages whose signature is not\n"
		"found.\n"
		"\n"
		"The 3rd form is just like the second form, but you pass a file\n"
		"containing a message instead of a pre-computed signature.\n"
		"\n"
		"Options:\n"
		"-W ignore whitespace\n"
		"-H skip past mail headers\n"
		"-B <bsize> force a block size of bsize\n"
		"-T <threshold> set the threshold above which spamsum will stop\n"
		"looking (default 90)\n";

	printf("%s", help_text);
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
char *sum;
|
||||
extern char *optarg;
|
||||
extern int optind;
|
||||
int c;
|
||||
char *dbname = NULL;
|
||||
u32 score;
|
||||
int i;
|
||||
u32 flags = 0;
|
||||
u32 block_size = 0;
|
||||
u32 threshold = 90;
|
||||
|
||||
while ((c = getopt(argc, argv, "B:WHd:c:C:hT:")) != -1) {
|
||||
switch (c) {
|
||||
case 'W':
|
||||
flags |= FLAG_IGNORE_WHITESPACE;
|
||||
break;
|
||||
|
||||
case 'H':
|
||||
flags |= FLAG_IGNORE_HEADERS;
|
||||
break;
|
||||
|
||||
case 'd':
|
||||
dbname = optarg;
|
||||
break;
|
||||
|
||||
case 'B':
|
||||
block_size = atoi(optarg);
|
||||
break;
|
||||
|
||||
case 'T':
|
||||
threshold = atoi(optarg);
|
||||
break;
|
||||
|
||||
case 'c':
|
||||
if (!dbname) {
|
||||
show_help();
|
||||
exit(1);
|
||||
}
|
||||
score = spamsum_match_db(dbname, optarg,
|
||||
threshold);
|
||||
printf("%u\n", score);
|
||||
exit(score >= threshold ? 0 : 2);
|
||||
|
||||
case 'C':
|
||||
if (!dbname) {
|
||||
show_help();
|
||||
exit(1);
|
||||
}
|
||||
score = spamsum_match_db(dbname,
|
||||
spamsum_file(optarg, flags,
|
||||
block_size),
|
||||
threshold);
|
||||
printf("%u\n", score);
|
||||
exit(score >= threshold ? 0 : 2);
|
||||
|
||||
case 'h':
|
||||
default:
|
||||
show_help();
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
|
||||
argc -= optind;
|
||||
argv += optind;
|
||||
|
||||
if (argc == 0) {
|
||||
show_help();
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* compute the spamsum on a list of files */
|
||||
for (i=0;i<argc;i++) {
|
||||
sum = spamsum_file(argv[i], flags, block_size);
|
||||
printf("%s\n", sum);
|
||||
free(sum);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
Loading…
Reference in a new issue