commit 9efeb7d8f3928597549c02365d6d5d463d7abe3d Author: freakboy3742 Date: Tue Jan 20 10:52:53 2009 +0000 Initial code checkin diff --git a/README b/README new file mode 100644 index 0000000..915078b --- /dev/null +++ b/README @@ -0,0 +1,65 @@ +PySpamSum v1.0 +============== + +spamsum is a fuzzy hash specifically designed for hashing email messages +to detect if they are SPAM. The spamsum utility includes the ability to +generate the spamsum hash and check a new message against an existing set +of hashes to find a match. + +pyspamsum is a Python wrapper for the core API of spamsum. + +The original spamsum code has been licensed under the terms of the +Perl Artistic License. It has been slightly modified. + +The original code is Copyright Andrew Tridgell 2002. +It forms part of Andrew's junkcode, and is available here: + + http://www.samba.org/junkcode/#spamsum + +The spamsum code in this project is derived from an updated version that +was published at Linux.conf.au 2004: + + http://linux.anu.edu.au/linux.conf.au/2004/papers/junkcode/spamsum + +For details on spamsum itself, please see the spamsum README: + + http://samba.org/ftp/unpacked/junkcode/spamsum/README + +This Python wrapper is released under the new BSD license, and is +Copyright Russell Keith-Magee 2009. + +Installation +------------ + +At a prompt, run: + +$ python setup.py install + +Usage +----- + +# Import spamsum and set up some strings +>>> import spamsum +>>> s1 = "I am the very model of a modern Major-General, I've information animal and vegegtable and mineral" +>>> s2 = "I am the very model of a modern Brigadier, I've information animal and vegetable and something else" +>>> s3 = "Huh? Gilbert and Who?" 
+ +# Evaluate the edit distance between two strings +>>> spamsum.edit_distance(s1, s2) +28 + +# Evaluate the spamsum of some strings +>>> sum1 = spamsum.spamsum(s1) +>>> sum2 = spamsum.spamsum(s2) +>>> sum3 = spamsum.spamsum(s3) +>>> print sum1 +3:kEvyc/sFIKwYclQY4MKLFE4IgunfELzIKygn:kE6Ai3KQ/MKOgWf/KZn + +# Compare two spamsums. 0 = no match, 100 = perfect match. +>>> spamsum.match(sum1, sum1) +100 +>>> spamsum.match(sum1, sum2) +66 +>>> spamsum.match(sum1, sum3) +0 + diff --git a/edit_dist.c b/edit_dist.c new file mode 100644 index 0000000..cec2f4a --- /dev/null +++ b/edit_dist.c @@ -0,0 +1,269 @@ +/* + This edit distance code is taken from trn3.6. A few minor + modifications have been made by Andrew Tridgell + for use in spamsum. +*/ + + +/***************************************************************************/ + + +/* The authors make no claims as to the fitness or correctness of this software + * for any use whatsoever, and it is provided as is. Any use of this software + * is at the user's own risk. + */ + +#include +#include +#include + +/* edit_dist -- returns the minimum edit distance between two strings + + Program by: Mark Maimone CMU Computer Science 13 Nov 89 + Last Modified: 28 Jan 90 + + If the input strings have length n and m, the algorithm runs in time + O(nm) and space O(min(m,n)). + +HISTORY + 13 Nov 89 (mwm) Created edit_dist() and set_costs(). + + 28 Jan 90 (mwm) Added view_costs(). Should verify that THRESHOLD + computations will work even when THRESHOLD is not a multiple of + sizeof(int). + + 17 May 93 (mwm) Improved performance when used with trn's newsgroup + processing; assume all costs are 1, and you can terminate when a + threshold is exceeded. +*/ + +#define MIN_DIST 100 + +#define TRN_SPEEDUP /* Use a less-general version of the + routine, one that's better for trn. 
+ All change costs are 1, and it's okay + to terminate if the edit distance is + known to exceed MIN_DIST */ + +#define THRESHOLD 4000 /* worry about allocating more memory only + when this # of bytes is exceeded */ +#define STRLENTHRESHOLD ((int) ((THRESHOLD / sizeof (int) - 3) / 2)) + +#define SAFE_ASSIGN(x,y) (((x) != NULL) ? (*(x) = (y)) : (y)) + +#define swap_int(x,y) (_iswap = (x), (x) = (y), (y) = _iswap) +#define swap_char(x,y) (_cswap = (x), (x) = (y), (y) = _cswap) +#define min3(x,y,z) (_mx = (x), _my = (y), _mz = (z), (_mx < _my ? (_mx < _mz ? _mx : _mz) : (_mz < _my) ? _mz : _my)) +#define min2(x,y) (_mx = (x), _my = (y), (_mx < _my ? _mx : _my)) + + +static int insert_cost = 1; +static int delete_cost = 1; +#ifndef TRN_SPEEDUP +static int change_cost = 1; +static int swap_cost = 1; +#endif + +static int _iswap; /* swap_int temp variable */ +static char *_cswap; /* swap_char temp variable */ +static int _mx, _my, _mz; /* min2, min3 temp variables */ + + + +/* edit_distn -- returns the edit distance between two strings, or -1 on + failure */ + +int +edit_distn(from, from_len, to, to_len) +char *from, *to; +register int from_len, to_len; +{ +#ifndef TRN_SPEEDUP + register int ins, del, ch; /* local copies of edit costs */ +#endif + register int row, col, index; /* dynamic programming counters */ + register int radix; /* radix for modular indexing */ +#ifdef TRN_SPEEDUP + register int low; +#endif + int *buffer; /* pointer to storage for one row + of the d.p. 
array */ + static int store[THRESHOLD / sizeof (int)]; + /* a small amount of static + storage, to be used when the + input strings are small enough */ + +/* Handle trivial cases when one string is empty */ + + if (from == NULL || !from_len) + if (to == NULL || !to_len) + return 0; + else + return to_len * insert_cost; + else if (to == NULL || !to_len) + return from_len * delete_cost; + +/* Initialize registers */ + + radix = 2 * from_len + 3; +#ifdef TRN_SPEEDUP +#define ins 1 +#define del 1 +#define ch 3 +#define swap_cost 5 +#else + ins = insert_cost; + del = delete_cost; + ch = change_cost; +#endif + +/* Make from short enough to fit in the static storage, if it's at all + possible */ + + if (from_len > to_len && from_len > STRLENTHRESHOLD) { + swap_int(from_len, to_len); + swap_char(from, to); +#ifndef TRN_SPEEDUP + swap_int(ins, del); +#endif + } /* if from_len > to_len */ + +/* Allocate the array storage (from the heap if necessary) */ + + if (from_len <= STRLENTHRESHOLD) + buffer = store; + else + buffer = (int *) malloc(radix * sizeof (int)); + +/* Here's where the fun begins. We will find the minimum edit distance + using dynamic programming. We only need to store two rows of the matrix + at a time, since we always progress down the matrix. For example, + given the strings "one" and "two", and insert, delete and change costs + equal to 1: + + _ o n e + _ 0 1 2 3 + t 1 1 2 3 + w 2 2 2 3 + o 3 2 3 3 + + The dynamic programming recursion is defined as follows: + + ar(x,0) := x * insert_cost + ar(0,y) := y * delete_cost + ar(x,y) := min(a(x - 1, y - 1) + (from[x] == to[y] ? 0 : change), + a(x - 1, y) + insert_cost, + a(x, y - 1) + delete_cost, + a(x - 2, y - 2) + (from[x] == to[y-1] && + from[x-1] == to[y] ? swap_cost : + infinity)) + + Since this only looks at most two rows and three columns back, we need + only store the values for the two preceeding rows. 
In this + implementation, we do not explicitly store the zero column, so only 2 * + from_len + 2 words are needed. However, in the implementation of the + swap_cost check, the current matrix value is used as a buffer; we + can't overwrite the earlier value until the swap_cost check has + been performed. So we use 2 * from_len + 3 elements in the buffer. +*/ + +#define ar(x,y,index) (((x) == 0) ? (y) * del : (((y) == 0) ? (x) * ins : \ + buffer[mod(index)])) +#define NW(x,y) ar(x, y, index + from_len + 2) +#define N(x,y) ar(x, y, index + from_len + 3) +#define W(x,y) ar(x, y, index + radix - 1) +#define NNWW(x,y) ar(x, y, index + 1) +#define mod(x) ((x) % radix) + + index = 0; + +#ifdef DEBUG_EDITDIST + printf(" "); + for (col = 0; col < from_len; col++) + printf(" %c ", from[col]); + printf("\n "); + + for (col = 0; col <= from_len; col++) + printf("%2d ", col * del); +#endif + +/* Row 0 is handled implicitly; its value at a given column is col*del. + The loop below computes the values for Row 1. At this point we know the + strings are nonempty. We also don't need to consider swap costs in row + 1. + + COMMENT: the indicies row and col below point into the STRING, so + the corresponding MATRIX indicies are row+1 and col+1. +*/ + + buffer[index++] = min2(ins + del, (from[0] == to[0] ? 0 : ch)); +#ifdef TRN_SPEEDUP + low = buffer[mod(index + radix - 1)]; +#endif + +#ifdef DEBUG_EDITDIST + printf("\n %c %2d %2d ", to[0], ins, buffer[index - 1]); +#endif + + for (col = 1; col < from_len; col++) { + buffer[index] = min3( + col * del + ((from[col] == to[0]) ? 
0 : ch), + (col + 1) * del + ins, + buffer[index - 1] + del); +#ifdef TRN_SPEEDUP + if (buffer[index] < low) + low = buffer[index]; +#endif + index++; + +#ifdef DEBUG_EDITDIST + printf("%2d ", buffer[index - 1]); +#endif + + } /* for col = 1 */ + +#ifdef DEBUG_EDITDIST + printf("\n %c %2d ", to[1], 2 * ins); +#endif + +/* Now handle the rest of the matrix */ + + for (row = 1; row < to_len; row++) { + for (col = 0; col < from_len; col++) { + buffer[index] = min3( + NW(row, col) + ((from[col] == to[row]) ? 0 : ch), + N(row, col + 1) + ins, + W(row + 1, col) + del); + if (from[col] == to[row - 1] && col > 0 && + from[col - 1] == to[row]) + buffer[index] = min2(buffer[index], + NNWW(row - 1, col - 1) + swap_cost); + +#ifdef DEBUG_EDITDIST + printf("%2d ", buffer[index]); +#endif +#ifdef TRN_SPEEDUP + if (buffer[index] < low || col == 0) + low = buffer[index]; +#endif + + index = mod(index + 1); + } /* for col = 1 */ +#ifdef DEBUG_EDITDIST + if (row < to_len - 1) + printf("\n %c %2d ", to[row+1], (row + 2) * ins); + else + printf("\n"); +#endif +#ifdef TRN_SPEEDUP + if (low > MIN_DIST) + break; +#endif + } /* for row = 1 */ + + row = buffer[mod(index + radix - 1)]; + if (buffer != store) + free((char *) buffer); + return row; +} /* edit_distn */ + diff --git a/pyspamsum.c b/pyspamsum.c new file mode 100644 index 0000000..b4dc102 --- /dev/null +++ b/pyspamsum.c @@ -0,0 +1,95 @@ +/** + * PySpamsum v1.0.0 + * + * A Python wrapper around the spamsum library written by + * Andrew Tridgell. 
+ * + * Copyright 2009 Russell Keith-Magee + */ +#include +#include +#include +#include + +int edit_distn(char *from, int from_len, char *to, int to_len); +char *spamsum(const unsigned char *in, size_t length, unsigned int flags, unsigned int bsize); +unsigned int spamsum_match(const char *str1, const char *str2); + +PyObject *py_edit_distance(PyObject *self, PyObject *args) +{ + PyObject *result = NULL; + int distance; + + char *from, *to; + int from_len, to_len; + + if (!PyArg_ParseTuple(args, "s#s#", &from, &from_len, &to, &to_len)) + { + return NULL; + } + + distance = edit_distn(from, from_len, to, to_len); + result = Py_BuildValue("i", distance); + + return result; +} + + +PyObject *py_spamsum(PyObject *self, PyObject *args) +{ + PyObject *result = NULL; + char *sum; + + unsigned char *in; + size_t length; + unsigned int flags, bsize; + + flags = 0; + bsize = 0; + + if (!PyArg_ParseTuple(args, "s#|ii", &in, &length, &flags, &bsize)) + { + return NULL; + } + sum = spamsum(in, length, flags, bsize); + result = Py_BuildValue("s", sum); + + return result; +} + +PyObject *py_match(PyObject *self, PyObject *args) +{ + PyObject *result = NULL; + unsigned int match; + char *str1, *str2; + + if (!PyArg_ParseTuple(args, "ss", &str1, &str2)) + { + return NULL; + } + + match = spamsum_match(str1, str2); + result = Py_BuildValue("i", match); + + return result; +} + + +static PyMethodDef methods[] = { + {"edit_distance", py_edit_distance, METH_VARARGS, + "Calculate the edit distance between two strings." + }, + {"spamsum", py_spamsum, METH_VARARGS, + "Calculate the spamsum of a string." + }, + {"match", py_match, METH_VARARGS, + "Given two spamsum strings return a value indicating the degree to which they match." 
+ }, + {NULL, NULL, 0, NULL} +}; + +PyMODINIT_FUNC +initspamsum() +{ + (void) Py_InitModule("spamsum", methods); +} \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..ed4bf57 --- /dev/null +++ b/setup.py @@ -0,0 +1,55 @@ +from distutils.core import setup, Extension + +setup(name = "spamsum", + version = "1.0.0", + author = "Russell Keith-Magee", + author_email = "russell@keith-magee.com", + url = 'http://code.google.com/p/pyspamsum/', + license = "New BSD", + classifiers = [ + 'Development Status :: 5 - Production/Stable', + 'License :: OSI Approved :: BSD License', + 'Operating System :: OS Independent', + 'Topic :: Text Processing', + 'Topic :: Utilities', + ], + platforms = ["any"], + description = "A Python wrapper for Andrew Tridgell's spamsum algorithm", + long_description = """ +spamsum is a fuzzy hash specifically designed for hashing email messages +to detect if they are SPAM. The spamsum utility includes the ability to +generate the spamsum hash and check a new message against a existing set +of hashes to find a match. + +pyspamsum is a Python wrapper for the core API of spamsum. + +The original spamsum code has been licensed under the terms of the +the Perl Artistic License. It has been slightly modified + +The original code is Copyright Andrew Tridgell 2002. 
+It forms part of Andrew's junkcode, and is available here: + + http://www.samba.org/junkcode/#spamsum + +The spamsum code in this project is derived from an updated version that +was published at Linux.conf.au 2004: + + http://linux.anu.edu.au/linux.conf.au/2004/papers/junkcode/spamsum + +For details on spamsum itself, please see the spamsum README: + + http://samba.org/ftp/unpacked/junkcode/spamsum/README +""", + classifiers = [ + + ], + ext_modules = [ + Extension( + "spamsum", [ + "pyspamsum.c", + "spamsum.c", + "edit_dist.c", + ] + ) + ] +) diff --git a/spamsum.c b/spamsum.c new file mode 100644 index 0000000..d891904 --- /dev/null +++ b/spamsum.c @@ -0,0 +1,679 @@ +/* + this is a checksum routine that is specifically designed for spam. + Copyright Andrew Tridgell 2002 + + This code is released under the GNU General Public License version 2 + or later. Alteratively, you may also use this code under the terms + of the Perl Artistic license. + + If you wish to distribute this code under the terms of a different + free software license then please ask me. If there is a good reason + then I will probably say yes. + + --- + + Modified by Russell Keith-Magee, 20 Jan 2009: + * removed the condition preventing comparison of small block sizes + (lines 364-366) + * Modified the help string to be legal cross platform C. 
+*/ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* the output is a string of length 64 in base64 */ +#define SPAMSUM_LENGTH 64 + +#define MIN_BLOCKSIZE 3 +#define HASH_PRIME 0x01000193 +#define HASH_INIT 0x28021967 + +#define ROLLING_WINDOW 7 + +#ifndef MIN +#define MIN(a,b) ((a)<(b)?(a):(b)) +#endif + +#ifndef MAX +#define MAX(a,b) ((a)>(b)?(a):(b)) +#endif + +typedef unsigned u32; +typedef unsigned char uchar; + +#define FLAG_IGNORE_WHITESPACE 1 +#define FLAG_IGNORE_HEADERS 2 + +static struct { + uchar window[ROLLING_WINDOW]; + u32 h1, h2, h3; + u32 n; +} roll_state; + +/* + a rolling hash, based on the Adler checksum. By using a rolling hash + we can perform auto resynchronisation after inserts/deletes + + internally, h1 is the sum of the bytes in the window and h2 + is the sum of the bytes times the index + + h3 is a shift/xor based rolling hash, and is mostly needed to ensure that + we can cope with large blocksize values +*/ +static inline u32 roll_hash(uchar c) +{ + roll_state.h2 -= roll_state.h1; + roll_state.h2 += ROLLING_WINDOW * c; + + roll_state.h1 += c; + roll_state.h1 -= roll_state.window[roll_state.n % ROLLING_WINDOW]; + + roll_state.window[roll_state.n % ROLLING_WINDOW] = c; + roll_state.n++; + + roll_state.h3 = (roll_state.h3 << 5) & 0xFFFFFFFF; + roll_state.h3 ^= c; + + return roll_state.h1 + roll_state.h2 + roll_state.h3; +} + +/* + reset the state of the rolling hash and return the initial rolling hash value +*/ +static u32 roll_reset(void) +{ + memset(&roll_state, 0, sizeof(roll_state)); + return 0; +} + +/* a simple non-rolling hash, based on the FNV hash */ +static inline u32 sum_hash(uchar c, u32 h) +{ + h *= HASH_PRIME; + h ^= c; + return h; +} + +/* + take a message of length 'length' and return a string representing a hash of that message, + prefixed by the selected blocksize +*/ +char *spamsum(const uchar *in, size_t length, u32 flags, u32 bsize) +{ + const char *b64 = 
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + char *ret, *p; + u32 total_chars; + u32 h, h2, h3; + u32 j, n, i, k; + u32 block_size; + uchar ret2[SPAMSUM_LENGTH/2 + 1]; + + /* if we are ignoring email headers then skip past them now */ + if (flags & FLAG_IGNORE_HEADERS) { + const uchar *s = strstr(in, "\n\n"); + if (s) { + length -= (s+2 - in); + in = s+2; + } + } + + if (flags & FLAG_IGNORE_WHITESPACE) { + /* count the non-ignored chars */ + for (n=0, i=0; i MIN_BLOCKSIZE && j < SPAMSUM_LENGTH/2) { + block_size = block_size / 2; + goto again; + } + + return ret; +} + + +/* + we only accept a match if we have at least one common substring in + the signature of length ROLLING_WINDOW. This dramatically drops the + false positive rate for low score thresholds while having + negligable affect on the rate of spam detection. + + return 1 if the two strings do have a common substring, 0 otherwise +*/ +static int has_common_substring(const char *s1, const char *s2) +{ + int i, j; + int num_hashes; + u32 hashes[SPAMSUM_LENGTH]; + + /* there are many possible algorithms for common substring + detection. In this case I am re-using the rolling hash code + to act as a filter for possible substring matches */ + + roll_reset(); + memset(hashes, 0, sizeof(hashes)); + + /* first compute the windowed rolling hash at each offset in + the first string */ + for (i=0;s1[i];i++) { + hashes[i] = roll_hash((uchar)s1[i]); + } + num_hashes = i; + + roll_reset(); + + /* now for each offset in the second string compute the + rolling hash and compare it to all of the rolling hashes + for the first string. If one matches then we have a + candidate substring match. 
We then confirm that match with + a direct string comparison */ + for (i=0;s2[i];i++) { + u32 h = roll_hash((uchar)s2[i]); + if (i < ROLLING_WINDOW-1) continue; + for (j=ROLLING_WINDOW-1;j= ROLLING_WINDOW && + strncmp(s2+i-(ROLLING_WINDOW-1), + s1+j-(ROLLING_WINDOW-1), + ROLLING_WINDOW) == 0) { + return 1; + } + } + } + } + + return 0; +} + + +/* + eliminate sequences of longer than 3 identical characters. These + sequences contain very little information so they tend to just bias + the result unfairly +*/ +static char *eliminate_sequences(const char *str) +{ + char *ret; + int i, j, len; + + ret = strdup(str); + if (!ret) return NULL; + + len = strlen(str); + + for (i=j=3;i SPAMSUM_LENGTH || len2 > SPAMSUM_LENGTH) { + /* not a real spamsum signature? */ + return 0; + } + + /* the two strings must have a common substring of length + ROLLING_WINDOW to be candidates */ + if (has_common_substring(s1, s2) == 0) { + return 0; + } + + /* compute the edit distance between the two strings. The edit distance gives + us a pretty good idea of how closely related the two strings are */ + score = edit_distn(s1, len1, s2, len2); + + /* scale the edit distance by the lengths of the two + strings. This changes the score to be a measure of the + proportion of the message that has changed rather than an + absolute quantity. It also copes with the variability of + the string lengths. */ + score = (score * SPAMSUM_LENGTH) / (len1 + len2); + + /* at this stage the score occurs roughly on a 0-64 scale, + * with 0 being a good match and 64 being a complete + * mismatch */ + + /* rescale to a 0-100 scale (friendlier to humans) */ + score = (100 * score) / 64; + + /* it is possible to get a score above 100 here, but it is a + really terrible match */ + if (score >= 100) return 0; + + /* now re-scale on a 0-100 scale with 0 being a poor match and + 100 being a excellent match. 
*/ + score = 100 - score; + + /* when the blocksize is small we may not want to exaggerate the match size */ + // if (score > block_size/MIN_BLOCKSIZE * MIN(len1, len2)) { + // score = block_size/MIN_BLOCKSIZE * MIN(len1, len2); + // } + + return score; +} + +/* + given two spamsum strings return a value indicating the degree to which they match. +*/ +u32 spamsum_match(const char *str1, const char *str2) +{ + u32 block_size1, block_size2; + u32 score = 0; + char *s1, *s2; + char *s1_1, *s1_2; + char *s2_1, *s2_2; + + /* each spamsum is prefixed by its block size */ + if (sscanf(str1, "%u:", &block_size1) != 1 || + sscanf(str2, "%u:", &block_size2) != 1) { + return 0; + } + + /* if the blocksizes don't match then we are comparing + apples to oranges ... */ + if (block_size1 != block_size2 && + block_size1 != block_size2*2 && + block_size2 != block_size1*2) { + return 0; + } + + /* move past the prefix */ + str1 = strchr(str1, ':'); + str2 = strchr(str2, ':'); + + if (!str1 || !str2) { + /* badly formed ... */ + return 0; + } + + /* there is very little information content is sequences of + the same character like 'LLLLL'. Eliminate any sequences + longer than 3. This is especially important when combined + with the has_common_substring() test below. */ + s1 = eliminate_sequences(str1+1); + s2 = eliminate_sequences(str2+1); + + if (!s1 || !s2) return -4; + + /* now break them into the two pieces */ + s1_1 = s1; + s2_1 = s2; + + s1_2 = strchr(s1, ':'); + s2_2 = strchr(s2, ':'); + + if (!s1_2 || !s2_2) { + /* a signature is malformed - it doesn't have 2 parts */ + free(s1); free(s2); + return 0; + } + + *s1_2++ = 0; + *s2_2++ = 0; + + /* each signature has a string for two block sizes. We now + choose how to combine the two block sizes. 
We checked above + that they have at least one block size in common */ + if (block_size1 == block_size2) { + u32 score1, score2; + score1 = score_strings(s1_1, s2_1, block_size1); + score2 = score_strings(s1_2, s2_2, block_size2); + score = MAX(score1, score2); + } else if (block_size1 == block_size2*2) { + score = score_strings(s1_1, s2_2, block_size1); + } else { + score = score_strings(s1_2, s2_1, block_size2); + } + + free(s1); + free(s2); + + return score; +} + +/* + return the maximum match for a file containing a list of spamsums +*/ +u32 spamsum_match_db(const char *fname, const char *sum, u32 threshold) +{ + FILE *f; + char line[100]; + u32 best = 0; + + f = fopen(fname, "r"); + if (!f) return 0; + + /* on each line of the database we compute the spamsum match + score. We then pick the best score */ + while (fgets(line, sizeof(line)-1, f)) { + u32 score; + int len; + len = strlen(line); + if (line[len-1] == '\n') line[len-1] = 0; + + score = spamsum_match(sum, line); + + if (score > best) { + best = score; + if (best >= threshold) break; + } + } + + fclose(f); + + return best; +} + +/* + return the spamsum on stdin +*/ +static char *spamsum_stdin(u32 flags, u32 block_size) +{ + uchar buf[10*1024]; + uchar *msg; + size_t length = 0; + int n; + char *sum; + + msg = malloc(sizeof(buf)); + if (!msg) return NULL; + + /* load the file, expanding the allocation as needed. 
*/ + while (1) { + n = read(0, buf, sizeof(buf)); + if (n == -1 && errno == EINTR) continue; + if (n <= 0) break; + + msg = realloc(msg, length + n); + if (!msg) return NULL; + + memcpy(msg+length, buf, n); + length += n; + } + + sum = spamsum(msg, length, flags, block_size); + + free(msg); + + return sum; +} + + +/* + return the spamsum on a file +*/ +char *spamsum_file(const char *fname, u32 flags, u32 block_size) +{ + int fd; + char *sum; + struct stat st; + uchar *msg; + + if (strcmp(fname, "-") == 0) { + return spamsum_stdin(flags, block_size); + } + + fd = open(fname, O_RDONLY); + if (fd == -1) { + perror(fname); + return NULL; + } + + if (fstat(fd, &st) == -1) { + perror("fstat"); + return NULL; + } + + msg = mmap(NULL, st.st_size, PROT_READ, MAP_FILE|MAP_PRIVATE, fd, 0); + if (msg == (uchar *)-1) { + perror("mmap"); + return NULL; + } + close(fd); + + sum = spamsum(msg, st.st_size, flags, block_size); + + munmap(msg, st.st_size); + + return sum; +} + +static void show_help(void) +{ + printf("\n\ +spamsum v1.1 written by Andrew Tridgell \n\ +\n\ +spamsum computes a signature string that is particular good for detecting if two emails\n\ +are very similar. This can be used to detect SPAM.\n\ +\n\ +Syntax:\n\ + spamsum [options] \n\ +or\n\ + spamsum [options] -d sigs.txt -c SIG\n\ +or\n\ + spamsum [options] -d sigs.txt -C file\n\ +\n\ +When called with a list of filenames spamsum will write out the\n\ +signatures of each file on a separate line. You can specify the\n\ +filename '-' for standard input.\n\ +\n\ +When called with the second form, spamsum will print the best score\n\ +for the given signature with the signatures in the given database. 
A\n\ +score of 100 means a perfect match, and a score of 0 means a complete\n\ +mismatch.\n\ +\n\ +When checking, spamsum returns 0 (success) when the message *is* spam,\n\ +1 for internal errors, and 2 for messages whose signature is not\n\ +found.\n\ +\n\ +The 3rd form is just like the second form, but you pass a file\n\ +containing a message instead of a pre-computed signature.\n\ +\n\ +Options:\n\ + -W ignore whitespace\n\ + -H skip past mail headers\n\ + -B force a block size of bsize\n\ + -T set the threshold above which spamsum will stop\n\ + looking (default 90)\n\ +"); +} + +int main(int argc, char *argv[]) +{ + char *sum; + extern char *optarg; + extern int optind; + int c; + char *dbname = NULL; + u32 score; + int i; + u32 flags = 0; + u32 block_size = 0; + u32 threshold = 90; + + while ((c = getopt(argc, argv, "B:WHd:c:C:hT:")) != -1) { + switch (c) { + case 'W': + flags |= FLAG_IGNORE_WHITESPACE; + break; + + case 'H': + flags |= FLAG_IGNORE_HEADERS; + break; + + case 'd': + dbname = optarg; + break; + + case 'B': + block_size = atoi(optarg); + break; + + case 'T': + threshold = atoi(optarg); + break; + + case 'c': + if (!dbname) { + show_help(); + exit(1); + } + score = spamsum_match_db(dbname, optarg, + threshold); + printf("%u\n", score); + exit(score >= threshold ? 0 : 2); + + case 'C': + if (!dbname) { + show_help(); + exit(1); + } + score = spamsum_match_db(dbname, + spamsum_file(optarg, flags, + block_size), + threshold); + printf("%u\n", score); + exit(score >= threshold ? 0 : 2); + + case 'h': + default: + show_help(); + exit(0); + } + } + + argc -= optind; + argv += optind; + + if (argc == 0) { + show_help(); + return 0; + } + + /* compute the spamsum on a list of files */ + for (i=0;i