Initial code checkin

This commit is contained in:
freakboy3742 2009-01-20 10:52:53 +00:00
commit 9efeb7d8f3
5 changed files with 1163 additions and 0 deletions

65
README Normal file
View file

@ -0,0 +1,65 @@
PySpamSum v1.0
==============
spamsum is a fuzzy hash specifically designed for hashing email messages
to detect if they are SPAM. The spamsum utility includes the ability to
generate the spamsum hash and check a new message against an existing set
of hashes to find a match.
pyspamsum is a Python wrapper for the core API of spamsum.
The original spamsum code has been licensed under the terms of the
Perl Artistic License. It has been slightly modified for use in this project.
The original code is Copyright Andrew Tridgell <tridge@samba.org> 2002.
It forms part of Andrew's junkcode, and is available here:
http://www.samba.org/junkcode/#spamsum
The spamsum code in this project is derived from an updated version that
was published at Linux.conf.au 2004:
http://linux.anu.edu.au/linux.conf.au/2004/papers/junkcode/spamsum
For details on spamsum itself, please see the spamsum README:
http://samba.org/ftp/unpacked/junkcode/spamsum/README
This Python wrapper is released under the new BSD license, and is
Copyright Russell Keith-Magee <russell@keith-magee.com> 2009.
Installation
------------
At a prompt, run:
$ python setup.py install
Usage
-----
# Import spamsum and set up some strings
>>> import spamsum
>>> s1 = "I am the very model of a modern Major-General, I've information animal and vegegtable and mineral"
>>> s2 = "I am the very model of a modern Brigadier, I've information animal and vegetable and something else"
>>> s3 = "Huh? Gilbert and Who?"
# Evaluate the edit distance between two strings
>>> spamsum.edit_distance(s1, s2)
28
# Evaluate the spamsum of some strings
>>> sum1 = spamsum.spamsum(s1)
>>> sum2 = spamsum.spamsum(s2)
>>> sum3 = spamsum.spamsum(s3)
>>> print sum1
3:kEvyc/sFIKwYclQY4MKLFE4IgunfELzIKygn:kE6Ai3KQ/MKOgWf/KZn
# Compare two spamsums. 0 = no match, 100 = perfect match.
>>> spamsum.match(sum1, sum1)
100
>>> spamsum.match(sum1, sum2)
66
>>> spamsum.match(sum1, sum3)
0

269
edit_dist.c Normal file
View file

@ -0,0 +1,269 @@
/*
This edit distance code is taken from trn3.6. A few minor
modifications have been made by Andrew Tridgell <tridge@samba.org>
for use in spamsum.
*/
/***************************************************************************/
/* The authors make no claims as to the fitness or correctness of this software
* for any use whatsoever, and it is provided as is. Any use of this software
* is at the user's own risk.
*/
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
/* edit_dist -- returns the minimum edit distance between two strings
Program by: Mark Maimone CMU Computer Science 13 Nov 89
Last Modified: 28 Jan 90
If the input strings have length n and m, the algorithm runs in time
O(nm) and space O(min(m,n)).
HISTORY
13 Nov 89 (mwm) Created edit_dist() and set_costs().
28 Jan 90 (mwm) Added view_costs(). Should verify that THRESHOLD
computations will work even when THRESHOLD is not a multiple of
sizeof(int).
17 May 93 (mwm) Improved performance when used with trn's newsgroup
processing; assume all costs are 1, and you can terminate when a
threshold is exceeded.
*/
#define MIN_DIST 100	/* with TRN_SPEEDUP: stop once every entry of a
			   row exceeds this value */

#define TRN_SPEEDUP	/* Use a less-general version of the routine, one
			   that's better for trn: the edit costs become
			   compile-time constants and it's okay to terminate
			   if the edit distance is known to exceed MIN_DIST */

#define THRESHOLD 4000	/* worry about allocating more memory only
			   when this # of bytes is exceeded */

/* longest `from' string whose working row still fits in the static store */
#define STRLENTHRESHOLD ((int) ((THRESHOLD / sizeof (int) - 3) / 2))

#define SAFE_ASSIGN(x,y) (((x) != NULL) ? (*(x) = (y)) : (y))

#define swap_int(x,y)  (_iswap = (x), (x) = (y), (y) = _iswap)
#define swap_char(x,y) (_cswap = (x), (x) = (y), (y) = _cswap)
#define min3(x,y,z) (_mx = (x), _my = (y), _mz = (z), (_mx < _my ? (_mx < _mz ? _mx : _mz) : (_mz < _my) ? _mz : _my))
#define min2(x,y) (_mx = (x), _my = (y), (_mx < _my ? _mx : _my))

/* costs used for the trivial (empty string) cases, and for every case when
   TRN_SPEEDUP is not defined */
static int insert_cost = 1;
static int delete_cost = 1;
#ifndef TRN_SPEEDUP
static int change_cost = 1;
static int swap_cost = 1;
#endif

static int _iswap;		/* swap_int temp variable */
static char *_cswap;		/* swap_char temp variable */
static int _mx, _my, _mz;	/* min2, min3 temp variables */

/* edit_distn -- returns the edit distance between two strings, or -1 on
   failure (the working row could not be allocated).

   from, from_len	first string and its length in bytes
   to, to_len		second string and its length in bytes

   Neither string needs to be NUL terminated; a NULL pointer or a zero
   length is treated as the empty string.

   When TRN_SPEEDUP is defined the costs are insert = 1, delete = 1,
   change = 3 and swap = 5, and the row loop stops as soon as the smallest
   entry of a row exceeds MIN_DIST.  In that case the value returned is the
   last entry of the row that was being computed, not the full distance.

   The function keeps its working row and macro temporaries in static
   storage. */
int
edit_distn(char *from, int from_len, char *to, int to_len)
{
#ifndef TRN_SPEEDUP
    int ins, del, ch;		/* local copies of edit costs */
#endif
    int row, col, index;	/* dynamic programming counters */
    int radix;			/* radix for modular indexing */
#ifdef TRN_SPEEDUP
    int low;			/* smallest value seen in the current row */
#endif
    int *buffer;		/* pointer to storage for one row
				   of the d.p. array */
    static int store[THRESHOLD / sizeof (int)];
				/* a small amount of static
				   storage, to be used when the
				   input strings are small enough */

    /* Handle trivial cases when one string is empty */
    if (from == NULL || !from_len) {
	if (to == NULL || !to_len)
	    return 0;
	return to_len * insert_cost;
    }
    if (to == NULL || !to_len)
	return from_len * delete_cost;

#ifdef TRN_SPEEDUP
#define ins 1
#define del 1
#define ch 3
#define swap_cost 5
#else
    ins = insert_cost;
    del = delete_cost;
    ch = change_cost;
#endif

    /* Make from short enough to fit in the static storage, if it's at all
       possible */
    if (from_len > to_len && from_len > STRLENTHRESHOLD) {
	swap_int(from_len, to_len);
	swap_char(from, to);
#ifndef TRN_SPEEDUP
	swap_int(ins, del);
#endif
    } /* if from_len > to_len */

    /* The radix must be derived from the FINAL from_len: every index
       computation below assumes radix == 2 * from_len + 3, and the static
       store is only large enough for the radix of a string that passed the
       STRLENTHRESHOLD test.  Computing it before the swap left it stale. */
    radix = 2 * from_len + 3;

    /* Allocate the array storage (from the heap if necessary) */
    if (from_len <= STRLENTHRESHOLD) {
	buffer = store;
    } else {
	buffer = malloc((size_t) radix * sizeof (int));
	if (buffer == NULL)
	    return -1;
    }

    /* Here's where the fun begins.  We will find the minimum edit distance
       using dynamic programming.  We only need to store two rows of the
       matrix at a time, since we always progress down the matrix.  For
       example, given the strings "one" and "two", and insert, delete and
       change costs equal to 1:

	     _  o  n  e
	  _  0  1  2  3
	  t  1  1  2  3
	  w  2  2  2  3
	  o  3  2  3  3

       The dynamic programming recursion is defined as follows:

	  ar(x,0) := x * insert_cost
	  ar(0,y) := y * delete_cost
	  ar(x,y) := min(a(x - 1, y - 1) + (from[x] == to[y] ? 0 : change),
			 a(x - 1, y) + insert_cost,
			 a(x, y - 1) + delete_cost,
			 a(x - 2, y - 2) + (from[x] == to[y-1] &&
					    from[x-1] == to[y] ? swap_cost :
					    infinity))

       Since this only looks at most two rows and three columns back, we
       need only store the values for the two preceding rows.  In this
       implementation, we do not explicitly store the zero column, so only
       2 * from_len + 2 words are needed.  However, in the implementation
       of the swap_cost check, the current matrix value is used as a
       buffer; we can't overwrite the earlier value until the swap_cost
       check has been performed.  So we use 2 * from_len + 3 elements in
       the buffer.
    */

#define ar(x,y,index) (((x) == 0) ? (y) * del : (((y) == 0) ? (x) * ins : \
	buffer[mod(index)]))
#define NW(x,y)	  ar(x, y, index + from_len + 2)
#define N(x,y)	  ar(x, y, index + from_len + 3)
#define W(x,y)	  ar(x, y, index + radix - 1)
#define NNWW(x,y) ar(x, y, index + 1)
#define mod(x) ((x) % radix)

    index = 0;

#ifdef DEBUG_EDITDIST
    printf(" ");
    for (col = 0; col < from_len; col++)
	printf(" %c ", from[col]);
    printf("\n ");
    for (col = 0; col <= from_len; col++)
	printf("%2d ", col * del);
#endif

    /* Row 0 is handled implicitly; its value at a given column is col*del.
       The loop below computes the values for Row 1.  At this point we know
       the strings are nonempty.  We also don't need to consider swap costs
       in row 1.

       COMMENT: the indices row and col below point into the STRING, so
       the corresponding MATRIX indices are row+1 and col+1.
    */
    buffer[index++] = min2(ins + del, (from[0] == to[0] ? 0 : ch));
#ifdef TRN_SPEEDUP
    low = buffer[mod(index + radix - 1)];
#endif

#ifdef DEBUG_EDITDIST
    printf("\n %c %2d %2d ", to[0], ins, buffer[index - 1]);
#endif

    for (col = 1; col < from_len; col++) {
	buffer[index] = min3(
		col * del + ((from[col] == to[0]) ? 0 : ch),
		(col + 1) * del + ins,
		buffer[index - 1] + del);
#ifdef TRN_SPEEDUP
	if (buffer[index] < low)
	    low = buffer[index];
#endif
	index++;

#ifdef DEBUG_EDITDIST
	printf("%2d ", buffer[index - 1]);
#endif
    } /* for col = 1 */

#ifdef DEBUG_EDITDIST
    printf("\n %c %2d ", to[1], 2 * ins);
#endif

    /* Now handle the rest of the matrix */
    for (row = 1; row < to_len; row++) {
	for (col = 0; col < from_len; col++) {
	    buffer[index] = min3(
		    NW(row, col) + ((from[col] == to[row]) ? 0 : ch),
		    N(row, col + 1) + ins,
		    W(row + 1, col) + del);
	    if (from[col] == to[row - 1] && col > 0 &&
		    from[col - 1] == to[row])
		buffer[index] = min2(buffer[index],
			NNWW(row - 1, col - 1) + swap_cost);

#ifdef DEBUG_EDITDIST
	    printf("%2d ", buffer[index]);
#endif
#ifdef TRN_SPEEDUP
	    /* col == 0 restarts the per-row minimum */
	    if (buffer[index] < low || col == 0)
		low = buffer[index];
#endif
	    index = mod(index + 1);
	} /* for col = 1 */

#ifdef DEBUG_EDITDIST
	if (row < to_len - 1)
	    printf("\n %c %2d ", to[row+1], (row + 2) * ins);
	else
	    printf("\n");
#endif
#ifdef TRN_SPEEDUP
	if (low > MIN_DIST)
	    break;
#endif
    } /* for row = 1 */

    /* the most recently written cell holds the answer */
    row = buffer[mod(index + radix - 1)];
    if (buffer != store)
	free(buffer);
    return row;
} /* edit_distn */

95
pyspamsum.c Normal file
View file

@ -0,0 +1,95 @@
/**
* PySpamsum v1.0.0
*
* A Python wrapper around the spamsum library written by
* Andrew Tridgell.
*
* Copyright 2009 Russell Keith-Magee <russell@keith-magee.com>
*/
#include <Python.h>
#include <sys/types.h>
#include <stdio.h>
#include <stdlib.h>
/* Defined in edit_dist.c: edit distance between two length-delimited strings. */
int edit_distn(char *from, int from_len, char *to, int to_len);
/* Defined in spamsum.c: returns a malloc'ed signature string (NULL if the allocation fails). */
char *spamsum(const unsigned char *in, size_t length, unsigned int flags, unsigned int bsize);
/* Defined in spamsum.c: scores two signature strings on a 0-100 scale. */
unsigned int spamsum_match(const char *str1, const char *str2);
/*
 * spamsum.edit_distance(a, b) -> int
 *
 * Parses two string/buffer arguments together with their lengths and
 * returns whatever edit_distn() reports for them as a Python integer.
 * Returns NULL (with the exception set by PyArg_ParseTuple) when the
 * arguments do not match.
 */
PyObject *py_edit_distance(PyObject *self, PyObject *args)
{
    char *lhs, *rhs;
    int lhs_len, rhs_len;

    if (PyArg_ParseTuple(args, "s#s#", &lhs, &lhs_len, &rhs, &rhs_len) == 0)
        return NULL;

    return Py_BuildValue("i", edit_distn(lhs, lhs_len, rhs, rhs_len));
}
/*
 * spamsum.spamsum(data[, flags[, bsize]]) -> str
 *
 * Computes the spamsum signature of `data`.  `flags` and `bsize` are
 * optional and default to 0; they are handed to spamsum() unchanged.
 *
 * Returns a new Python string, or NULL with an exception set: whatever
 * PyArg_ParseTuple raised for bad arguments, or MemoryError when spamsum()
 * could not allocate the signature.
 */
PyObject *py_spamsum(PyObject *self, PyObject *args)
{
    PyObject *result;
    char *sum;
    const char *in;
    /* This file does not define PY_SSIZE_T_CLEAN, so "s#" stores the length
       through an int pointer.  Receiving it in a size_t would leave the
       upper bytes uninitialised on LP64 platforms. */
    int length = 0;
    /* "i" stores through an int pointer as well, so parse into ints and
       convert when calling spamsum(). */
    int flags = 0;
    int bsize = 0;

    if (!PyArg_ParseTuple(args, "s#|ii", &in, &length, &flags, &bsize))
    {
        return NULL;
    }

    sum = spamsum((const unsigned char *) in, (size_t) length,
                  (unsigned int) flags, (unsigned int) bsize);
    if (sum == NULL)
    {
        return PyErr_NoMemory();
    }

    /* Py_BuildValue copies the text, so the malloc'ed signature must be
       released here. */
    result = Py_BuildValue("s", sum);
    free(sum);
    return result;
}
/*
 * spamsum.match(sig1, sig2) -> int
 *
 * Parses two NUL-terminated string arguments and returns the value of
 * spamsum_match() for them as a Python integer.  Returns NULL (with the
 * exception set by PyArg_ParseTuple) when the arguments do not match.
 */
PyObject *py_match(PyObject *self, PyObject *args)
{
    char *first, *second;
    unsigned int degree;

    if (PyArg_ParseTuple(args, "ss", &first, &second) == 0)
        return NULL;

    degree = spamsum_match(first, second);
    return Py_BuildValue("i", degree);
}
static PyMethodDef methods[] = {
{"edit_distance", py_edit_distance, METH_VARARGS,
"Calculate the edit distance between two strings."
},
{"spamsum", py_spamsum, METH_VARARGS,
"Calculate the spamsum of a string."
},
{"match", py_match, METH_VARARGS,
"Given two spamsum strings return a value indicating the degree to which they match."
},
{NULL, NULL, 0, NULL}
};
/* Module initialisation entry point: registers the method table under the name "spamsum". */
PyMODINIT_FUNC
initspamsum(void)
{
    Py_InitModule("spamsum", methods);
}

55
setup.py Normal file
View file

@ -0,0 +1,55 @@
from distutils.core import setup, Extension
# Package metadata and build instructions for the "spamsum" C extension.
# NOTE: `classifiers` must be passed exactly once; repeating a keyword
# argument in a call is a SyntaxError, so the second (empty) list was removed.
setup(
    name = "spamsum",
    version = "1.0.0",
    author = "Russell Keith-Magee",
    author_email = "russell@keith-magee.com",
    url = 'http://code.google.com/p/pyspamsum/',
    license = "New BSD",
    classifiers = [
        'Development Status :: 5 - Production/Stable',
        'License :: OSI Approved :: BSD License',
        'Operating System :: OS Independent',
        'Topic :: Text Processing',
        'Topic :: Utilities',
    ],
    platforms = ["any"],
    description = "A Python wrapper for Andrew Tridgell's spamsum algorithm",
    long_description = """
spamsum is a fuzzy hash specifically designed for hashing email messages
to detect if they are SPAM. The spamsum utility includes the ability to
generate the spamsum hash and check a new message against a existing set
of hashes to find a match.
pyspamsum is a Python wrapper for the core API of spamsum.
The original spamsum code has been licensed under the terms of the
the Perl Artistic License. It has been slightly modified
The original code is Copyright Andrew Tridgell <tridge@samba.org> 2002.
It forms part of Andrew's junkcode, and is available here:
http://www.samba.org/junkcode/#spamsum
The spamsum code in this project is derived from an updated version that
was published at Linux.conf.au 2004:
http://linux.anu.edu.au/linux.conf.au/2004/papers/junkcode/spamsum
For details on spamsum itself, please see the spamsum README:
http://samba.org/ftp/unpacked/junkcode/spamsum/README
""",
    # The extension is built from the wrapper plus the two C sources it calls into.
    ext_modules = [
        Extension(
            "spamsum", [
                "pyspamsum.c",
                "spamsum.c",
                "edit_dist.c",
            ]
        )
    ]
)

679
spamsum.c Normal file
View file

@ -0,0 +1,679 @@
/*
this is a checksum routine that is specifically designed for spam.
Copyright Andrew Tridgell <tridge@samba.org> 2002
This code is released under the GNU General Public License version 2
or later. Alternatively, you may also use this code under the terms
of the Perl Artistic license.
If you wish to distribute this code under the terms of a different
free software license then please ask me. If there is a good reason
then I will probably say yes.
---
Modified by Russell Keith-Magee, 20 Jan 2009:
* removed the condition preventing comparison of small block sizes
(lines 364-366)
* Modified the help string to be legal cross platform C.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <errno.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#include <ctype.h>
/* the output is a string of length 64 in base64 */
#define SPAMSUM_LENGTH 64
#define MIN_BLOCKSIZE 3
#define HASH_PRIME 0x01000193
#define HASH_INIT 0x28021967
#define ROLLING_WINDOW 7
#ifndef MIN
#define MIN(a,b) ((a)<(b)?(a):(b))
#endif
#ifndef MAX
#define MAX(a,b) ((a)>(b)?(a):(b))
#endif
typedef unsigned u32;
typedef unsigned char uchar;
#define FLAG_IGNORE_WHITESPACE 1
#define FLAG_IGNORE_HEADERS 2
/* State of the rolling hash; updated by roll_hash() and zeroed by roll_reset(). */
static struct {
/* most recent bytes, written at slot n % ROLLING_WINDOW */
uchar window[ROLLING_WINDOW];
/* the three hash components that roll_hash() adds together */
u32 h1, h2, h3;
/* number of bytes fed in since the last reset */
u32 n;
} roll_state;
/*
  Rolling hash, based on the Adler checksum. Because it only depends on
  the most recent bytes it resynchronises automatically after inserts
  and deletes.

  h1 is the sum of the bytes in the window, h2 is the sum of the bytes
  times their index, and h3 is a shift/xor hash that is mostly needed to
  cope with large blocksize values.

  Feeds one byte into roll_state and returns h1 + h2 + h3.
*/
static inline u32 roll_hash(uchar c)
{
	const u32 slot = roll_state.n % ROLLING_WINDOW;

	/* h2 is updated first: it needs h1 as it was before this byte */
	roll_state.h2 = roll_state.h2 - roll_state.h1 + ROLLING_WINDOW * c;

	/* the new byte enters the window, the byte it replaces leaves it */
	roll_state.h1 = roll_state.h1 + c - roll_state.window[slot];
	roll_state.window[slot] = c;
	roll_state.n++;

	roll_state.h3 = ((roll_state.h3 << 5) & 0xFFFFFFFF) ^ c;

	return roll_state.h1 + roll_state.h2 + roll_state.h3;
}
/*
reset the state of the rolling hash and return the initial rolling hash value
*/
static u32 roll_reset(void)
{
memset(&roll_state, 0, sizeof(roll_state));
return 0;
}
/* Simple non-rolling hash, based on the FNV hash: multiply the running
   value h by HASH_PRIME, then xor in the byte c. */
static inline u32 sum_hash(uchar c, u32 h)
{
	return (h * HASH_PRIME) ^ c;
}
/*
  Take a message of length 'length' and return a string representing a
  hash of that message, prefixed by the selected blocksize.

  in      message bytes; the buffer is treated as length-delimited and does
          not have to be NUL terminated
  length  number of bytes in 'in'
  flags   any combination of FLAG_IGNORE_WHITESPACE and FLAG_IGNORE_HEADERS
  bsize   block size to use, or 0 to let the function choose one

  The result has the form "<blocksize>:<sig1>:<sig2>", where sig2 is
  computed with twice the block size. It is allocated with malloc() and
  must be released by the caller with free(). NULL is returned when the
  allocation fails.

  The function uses the file-level rolling hash state.
*/
char *spamsum(const uchar *in, size_t length, u32 flags, u32 bsize)
{
	const char *b64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
	char *ret, *p;
	u32 total_chars;
	u32 h, h2, h3;
	u32 j, k;
	size_t i;	/* size_t so that it can always reach 'length' */
	u32 block_size;
	uchar ret2[SPAMSUM_LENGTH/2 + 1];

	/* if we are ignoring email headers then skip past them now. The
	   search is bounded by 'length' because the input is not required
	   to be NUL terminated (it may be an mmap'ed file, for instance) */
	if (flags & FLAG_IGNORE_HEADERS) {
		for (i = 0; i + 1 < length; i++) {
			if (in[i] == '\n' && in[i+1] == '\n') {
				in += i + 2;
				length -= i + 2;
				break;
			}
		}
	}

	if (flags & FLAG_IGNORE_WHITESPACE) {
		/* count the non-ignored chars */
		total_chars = 0;
		for (i = 0; i < length; i++) {
			if (isspace(in[i])) continue;
			total_chars++;
		}
	} else {
		total_chars = length;
	}

	if (bsize == 0) {
		/* guess a reasonable block size. The product is formed in
		   64 bits: a 32 bit product could wrap around and keep
		   the loop running forever on very large inputs */
		block_size = MIN_BLOCKSIZE;
		while ((unsigned long long)block_size * SPAMSUM_LENGTH < total_chars) {
			block_size = block_size * 2;
		}
	} else {
		block_size = bsize;
	}

	/* room for the "<blocksize>:" prefix, both signatures, the
	   separator and the terminator */
	ret = malloc(SPAMSUM_LENGTH + SPAMSUM_LENGTH/2 + 20);
	if (!ret) return NULL;

again:
	/* the first part of the spamsum signature is the blocksize */
	snprintf(ret, 12, "%u:", block_size);
	p = ret + strlen(ret);

	memset(p, 0, SPAMSUM_LENGTH+1);
	memset(ret2, 0, sizeof(ret2));

	k = j = 0;
	h3 = h2 = HASH_INIT;
	h = roll_reset();

	for (i = 0; i < length; i++) {
		if ((flags & FLAG_IGNORE_WHITESPACE) &&
		    isspace(in[i])) continue;

		/*
		   at each character we update the rolling hash and
		   the normal hash. When the rolling hash hits the
		   reset value then we emit the normal hash as an
		   element of the signature and reset both hashes
		*/
		h = roll_hash(in[i]);
		h2 = sum_hash(in[i], h2);
		h3 = sum_hash(in[i], h3);

		if (h % block_size == (block_size-1)) {
			/* we have hit a reset point. We now emit a
			   hash which is based on all characters in the
			   piece of the message between the last reset
			   point and this one */
			p[j] = b64[h2 % 64];
			if (j < SPAMSUM_LENGTH-1) {
				/* we can have a problem with the tail
				   overflowing. The easiest way to
				   cope with this is to only reset the
				   second hash if we have room for
				   more characters in our
				   signature. This has the effect of
				   combining the last few pieces of
				   the message into a single piece */
				h2 = HASH_INIT;
				j++;
			}
		}

		/* this produces a second signature with a block size
		   of block_size*2. By producing dual signatures in
		   this way the effect of small changes in the message
		   size near a block size boundary is greatly reduced. */
		if (h % (block_size*2) == ((block_size*2)-1)) {
			ret2[k] = b64[h3 % 64];
			if (k < SPAMSUM_LENGTH/2-1) {
				h3 = HASH_INIT;
				k++;
			}
		}
	}

	/* if we have anything left then add it to the end. This
	   ensures that the last part of the message is always
	   considered */
	if (h != 0) {
		p[j] = b64[h2 % 64];
		ret2[k] = b64[h3 % 64];
	}

	/* p and ret2 were zero filled above, so both are terminated */
	strcat(p+j, ":");
	strcat(p+j, (const char *)ret2);

	/* our blocksize guess may have been way off - repeat if necessary */
	if (bsize == 0 && block_size > MIN_BLOCKSIZE && j < SPAMSUM_LENGTH/2) {
		block_size = block_size / 2;
		goto again;
	}

	return ret;
}
/*
  We only accept a match if the two signatures share at least one
  substring of length ROLLING_WINDOW. This dramatically drops the false
  positive rate for low score thresholds while having a negligible
  effect on the rate of spam detection.

  Returns 1 if the two strings do have a common substring, 0 otherwise.

  The rolling hash of every position of s1 is stored in a fixed array of
  SPAMSUM_LENGTH entries, so s1 must not be longer than that.
*/
static int has_common_substring(const char *s1, const char *s2)
{
	u32 hashes[SPAMSUM_LENGTH];
	int n1 = 0;
	int pos, k;

	/* there are many possible algorithms for common substring
	   detection. Here the rolling hash code is re-used as a filter
	   for possible substring matches */
	memset(hashes, 0, sizeof(hashes));

	/* pass 1: windowed rolling hash at each offset of the first string */
	roll_reset();
	while (s1[n1] != '\0') {
		hashes[n1] = roll_hash((uchar)s1[n1]);
		n1++;
	}

	/* pass 2: hash each offset of the second string and look for the
	   same value among the hashes of the first. A hit is only a
	   candidate; it is confirmed with a direct string comparison */
	roll_reset();
	for (pos = 0; s2[pos] != '\0'; pos++) {
		const u32 h = roll_hash((uchar)s2[pos]);
		const char *win2;

		/* the window is not full yet */
		if (pos < ROLLING_WINDOW-1) continue;

		win2 = s2 + pos - (ROLLING_WINDOW-1);

		for (k = ROLLING_WINDOW-1; k < n1; k++) {
			if (hashes[k] == 0 || hashes[k] != h) continue;

			if (strlen(win2) < ROLLING_WINDOW) continue;

			if (strncmp(win2, s1 + k - (ROLLING_WINDOW-1),
				    ROLLING_WINDOW) == 0) {
				return 1;
			}
		}
	}

	return 0;
}
/*
  Eliminate sequences of longer than 3 identical characters. These
  sequences contain very little information so they tend to just bias
  the result unfairly.

  Returns a newly allocated copy of str in which every run of identical
  characters has been cut down to at most 3. The caller must free() the
  result. Returns NULL if the allocation fails.
*/
static char *eliminate_sequences(const char *str)
{
	size_t len = strlen(str);
	size_t i, j;
	char *ret;

	ret = malloc(len + 1);
	if (!ret) return NULL;
	memcpy(ret, str, len + 1);

	/* nothing can be removed from a string of 3 characters or fewer.
	   Returning here also keeps the terminator write below inside the
	   allocation: it would land on ret[3] even for shorter strings */
	if (len <= 3) return ret;

	/* the first 3 characters are always kept; a later character is
	   dropped when it equals all 3 of its predecessors */
	for (i = j = 3; i < len; i++) {
		if (str[i] != str[i-1] ||
		    str[i] != str[i-2] ||
		    str[i] != str[i-3]) {
			ret[j++] = str[i];
		}
	}

	ret[j] = 0;

	return ret;
}
/*
  Low level string scoring. Takes two signature pieces and scores them
  on a scale of 0-100 where 0 is a terrible match and 100 is a great
  match.

  block_size is accepted for the benefit of callers but is not used:
  the cap it used to apply to the score of very small messages is
  disabled in this version.
*/
static unsigned score_strings(const char *s1, const char *s2, u32 block_size)
{
	int edit_distn(const char *from, int from_len, const char *to, int to_len);
	const u32 len1 = strlen(s1);
	const u32 len2 = strlen(s2);
	u32 score;

	(void)block_size;

	/* longer than a real spamsum signature can be */
	if (len1 > SPAMSUM_LENGTH || len2 > SPAMSUM_LENGTH) return 0;

	/* candidates must share a substring of length ROLLING_WINDOW */
	if (!has_common_substring(s1, s2)) return 0;

	/* the edit distance gives a pretty good idea of how closely
	   related the two strings are */
	score = edit_distn(s1, len1, s2, len2);

	/* scale by the combined length, so that the score measures the
	   proportion that changed rather than an absolute quantity.
	   This puts it roughly on a 0-64 scale, 0 being a good match */
	score = (score * SPAMSUM_LENGTH) / (len1 + len2);

	/* rescale to a 0-100 scale (friendlier to humans) */
	score = (100 * score) / 64;

	/* values of 100 and above are possible here and denote a really
	   terrible match; everything else is inverted so that 100 is an
	   excellent match and 0 a poor one */
	return (score >= 100) ? 0 : 100 - score;
}
/*
  Given two spamsum strings return a value indicating the degree to
  which they match: 0 means no match, 100 a perfect match.

  0 is also returned when either string is not a well formed signature
  ("<blocksize>:<sig1>:<sig2>"), when the block sizes are neither equal
  nor a factor of two apart, or when memory runs out.
*/
u32 spamsum_match(const char *str1, const char *str2)
{
	u32 block_size1, block_size2;
	u32 score = 0;
	char *s1, *s2;
	char *s1_1, *s1_2;
	char *s2_1, *s2_2;

	/* each spamsum is prefixed by its block size */
	if (sscanf(str1, "%u:", &block_size1) != 1 ||
	    sscanf(str2, "%u:", &block_size2) != 1) {
		return 0;
	}

	/* if the blocksizes don't match then we are comparing
	   apples to oranges ... */
	if (block_size1 != block_size2 &&
	    block_size1 != block_size2*2 &&
	    block_size2 != block_size1*2) {
		return 0;
	}

	/* move past the prefix */
	str1 = strchr(str1, ':');
	str2 = strchr(str2, ':');

	if (!str1 || !str2) {
		/* badly formed ... */
		return 0;
	}

	/* there is very little information content in sequences of
	   the same character like 'LLLLL'. Eliminate any sequences
	   longer than 3. This is especially important when combined
	   with the has_common_substring() test in score_strings(). */
	s1 = eliminate_sequences(str1+1);
	s2 = eliminate_sequences(str2+1);

	if (!s1 || !s2) {
		/* out of memory. The return type is unsigned, so a negative
		   error code would read as an enormous score; report "no
		   match" instead and release whichever copy succeeded */
		free(s1);
		free(s2);
		return 0;
	}

	/* now break them into the two pieces */
	s1_1 = s1;
	s2_1 = s2;

	s1_2 = strchr(s1, ':');
	s2_2 = strchr(s2, ':');

	if (!s1_2 || !s2_2) {
		/* a signature is malformed - it doesn't have 2 parts */
		free(s1); free(s2);
		return 0;
	}

	*s1_2++ = 0;
	*s2_2++ = 0;

	/* each signature has a string for two block sizes. We now
	   choose how to combine the two block sizes. We checked above
	   that they have at least one block size in common */
	if (block_size1 == block_size2) {
		u32 score1, score2;
		score1 = score_strings(s1_1, s2_1, block_size1);
		score2 = score_strings(s1_2, s2_2, block_size2);
		score = MAX(score1, score2);
	} else if (block_size1 == block_size2*2) {
		score = score_strings(s1_1, s2_2, block_size1);
	} else {
		score = score_strings(s1_2, s2_1, block_size2);
	}

	free(s1);
	free(s2);

	return score;
}
/*
  Return the maximum match for a file containing a list of spamsums.

  fname      file with one signature per line
  sum        signature to compare against every line
  threshold  reading stops at the first score that reaches this value

  Returns the best score found, or 0 if the file cannot be opened.
*/
u32 spamsum_match_db(const char *fname, const char *sum, u32 threshold)
{
	FILE *f;
	/* a signature can be as long as a 10 digit block size, two ':'
	   separators and SPAMSUM_LENGTH + SPAMSUM_LENGTH/2 hash
	   characters, which did not fit the previous 100 byte buffer and
	   caused such lines to be split in two */
	char line[256];
	u32 best = 0;

	f = fopen(fname, "r");
	if (!f) return 0;

	/* on each line of the database we compute the spamsum match
	   score. We then pick the best score */
	while (fgets(line, sizeof(line), f)) {
		u32 score;
		size_t len;
		len = strlen(line);
		/* len is 0 when the line starts with a NUL byte */
		if (len > 0 && line[len-1] == '\n') line[len-1] = 0;

		score = spamsum_match(sum, line);

		if (score > best) {
			best = score;
			if (best >= threshold) break;
		}
	}

	fclose(f);

	return best;
}
/*
  Return the spamsum on stdin.

  Reads file descriptor 0 until end of file or a read error, then hashes
  everything that was read. The result is malloc'ed by spamsum() and
  must be freed by the caller. Returns NULL when memory runs out.
*/
static char *spamsum_stdin(u32 flags, u32 block_size)
{
	uchar buf[10*1024];
	uchar *msg;
	size_t length = 0;
	char *sum;

	/* the message is kept NUL terminated (one byte beyond 'length')
	   so that it is also safe to hand to string functions */
	msg = malloc(1);
	if (!msg) return NULL;
	msg[0] = 0;

	/* load the file, expanding the allocation as needed. */
	while (1) {
		uchar *grown;
		ssize_t n = read(0, buf, sizeof(buf));
		if (n == -1 && errno == EINTR) continue;
		if (n <= 0) break;

		/* go through a temporary: on failure realloc() leaves the
		   old block allocated, and it must still be freed */
		grown = realloc(msg, length + (size_t)n + 1);
		if (!grown) {
			free(msg);
			return NULL;
		}
		msg = grown;

		memcpy(msg+length, buf, (size_t)n);
		length += (size_t)n;
		msg[length] = 0;
	}

	sum = spamsum(msg, length, flags, block_size);

	free(msg);

	return sum;
}
/*
  Return the spamsum on a file.

  fname       path of the file to hash, or "-" for standard input
  flags       passed on to spamsum()
  block_size  passed on to spamsum()

  The result is malloc'ed by spamsum() and must be freed by the caller.
  NULL is returned when the file cannot be opened, examined or mapped
  (a message is printed with perror() in those cases) or when spamsum()
  itself fails.
*/
char *spamsum_file(const char *fname, u32 flags, u32 block_size)
{
	int fd;
	char *sum;
	struct stat st;
	uchar *msg;

	if (strcmp(fname, "-") == 0) {
		return spamsum_stdin(flags, block_size);
	}

	fd = open(fname, O_RDONLY);
	if (fd == -1) {
		perror(fname);
		return NULL;
	}

	if (fstat(fd, &st) == -1) {
		perror("fstat");
		close(fd);	/* don't leak the descriptor */
		return NULL;
	}

	if (st.st_size == 0) {
		/* mmap() rejects a zero length, so an empty file is hashed
		   as an empty message instead */
		close(fd);
		return spamsum((const uchar *)"", 0, flags, block_size);
	}

	msg = mmap(NULL, st.st_size, PROT_READ, MAP_FILE|MAP_PRIVATE, fd, 0);
	if (msg == MAP_FAILED) {
		perror("mmap");
		close(fd);	/* don't leak the descriptor */
		return NULL;
	}
	/* the mapping stays valid after the descriptor is closed */
	close(fd);

	sum = spamsum(msg, st.st_size, flags, block_size);

	munmap(msg, st.st_size);

	return sum;
}
/* Write the usage text to standard output. */
static void show_help(void)
{
	static const char usage[] =
		"\n"
		"spamsum v1.1 written by Andrew Tridgell <tridge@samba.org>\n"
		"\n"
		"spamsum computes a signature string that is particular good for detecting if two emails\n"
		"are very similar. This can be used to detect SPAM.\n"
		"\n"
		"Syntax:\n"
		"spamsum [options] <files>\n"
		"or\n"
		"spamsum [options] -d sigs.txt -c SIG\n"
		"or\n"
		"spamsum [options] -d sigs.txt -C file\n"
		"\n"
		"When called with a list of filenames spamsum will write out the\n"
		"signatures of each file on a separate line. You can specify the\n"
		"filename '-' for standard input.\n"
		"\n"
		"When called with the second form, spamsum will print the best score\n"
		"for the given signature with the signatures in the given database. A\n"
		"score of 100 means a perfect match, and a score of 0 means a complete\n"
		"mismatch.\n"
		"\n"
		"When checking, spamsum returns 0 (success) when the message *is* spam,\n"
		"1 for internal errors, and 2 for messages whose signature is not\n"
		"found.\n"
		"\n"
		"The 3rd form is just like the second form, but you pass a file\n"
		"containing a message instead of a pre-computed signature.\n"
		"\n"
		"Options:\n"
		"-W ignore whitespace\n"
		"-H skip past mail headers\n"
		"-B <bsize> force a block size of bsize\n"
		"-T <threshold> set the threshold above which spamsum will stop\n"
		"looking (default 90)\n";

	/* the text contains no conversion specifications, so writing it
	   verbatim produces the same output as printf() */
	fputs(usage, stdout);
}
/*
  Command line entry point.

  Without -c/-C the remaining arguments are file names; the signature of
  each one is printed on its own line. The exit status is then 0, or 1 if
  the signature of any file could not be computed.

  With -c SIG or -C file (both need a preceding -d database) the best
  score against the database is printed and the exit status is 0 when it
  reaches the threshold, 2 when it does not, and 1 on errors.
*/
int main(int argc, char *argv[])
{
	char *sum;
	extern char *optarg;
	extern int optind;
	int c;
	char *dbname = NULL;
	u32 score;
	int i;
	int status = 0;
	u32 flags = 0;
	u32 block_size = 0;
	u32 threshold = 90;

	while ((c = getopt(argc, argv, "B:WHd:c:C:hT:")) != -1) {
		switch (c) {
		case 'W':
			flags |= FLAG_IGNORE_WHITESPACE;
			break;

		case 'H':
			flags |= FLAG_IGNORE_HEADERS;
			break;

		case 'd':
			dbname = optarg;
			break;

		case 'B':
			block_size = atoi(optarg);
			break;

		case 'T':
			threshold = atoi(optarg);
			break;

		case 'c':
			if (!dbname) {
				show_help();
				exit(1);
			}
			score = spamsum_match_db(dbname, optarg,
						 threshold);
			printf("%u\n", score);
			exit(score >= threshold ? 0 : 2);

		case 'C':
			if (!dbname) {
				show_help();
				exit(1);
			}
			sum = spamsum_file(optarg, flags, block_size);
			if (!sum) {
				/* the message could not be read or hashed;
				   a NULL signature must not be handed to
				   the matcher */
				exit(1);
			}
			score = spamsum_match_db(dbname, sum, threshold);
			free(sum);
			printf("%u\n", score);
			exit(score >= threshold ? 0 : 2);

		case 'h':
		default:
			show_help();
			exit(0);
		}
	}

	argc -= optind;
	argv += optind;

	if (argc == 0) {
		show_help();
		return 0;
	}

	/* compute the spamsum on a list of files */
	for (i=0;i<argc;i++) {
		sum = spamsum_file(argv[i], flags, block_size);
		if (!sum) {
			/* passing NULL to "%s" is undefined; skip the file
			   and remember the failure */
			status = 1;
			continue;
		}
		printf("%s\n", sum);
		free(sum);
	}

	return status;
}