pyspamsum/spamsum.c

/*
  this is a checksum routine that is specifically designed for spam.
  Copyright Andrew Tridgell <tridge@samba.org> 2002

  This code is released under the GNU General Public License version 2
  or later.  Alternatively, you may also use this code under the terms
  of the Perl Artistic license.

  If you wish to distribute this code under the terms of a different
  free software license then please ask me. If there is a good reason
  then I will probably say yes.

  ---

  Modified by Russell Keith-Magee, 20 Jan 2009:
  * removed the condition preventing comparison of small block sizes
      (lines 364-366)
  * Modified the help string to be legal cross platform C.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <errno.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#include <ctype.h>

/* maximum number of base64 characters in the first signature part;
   the second part is limited to SPAMSUM_LENGTH/2 characters */
#define SPAMSUM_LENGTH 64

/* smallest block size, and the starting point when spamsum() guesses
   a block size for a message */
#define MIN_BLOCKSIZE 3
/* multiplier applied by sum_hash() at every step */
#define HASH_PRIME 0x01000193
/* value the per-piece hashes are (re)started from in spamsum() */
#define HASH_INIT 0x28021967

/* number of trailing bytes covered by the rolling hash; also the
   length of the common substring required by has_common_substring() */
#define ROLLING_WINDOW 7

/* NOTE: both macros evaluate their arguments more than once, so do
   not pass expressions with side effects */
#ifndef MIN
#define MIN(a,b) ((a)<(b)?(a):(b))
#endif

#ifndef MAX
#define MAX(a,b) ((a)>(b)?(a):(b))
#endif

/* NOTE(review): u32 is plain 'unsigned'; the code appears to assume it
   is 32 bits wide - confirm for the target platforms */
typedef unsigned u32;
typedef unsigned char uchar;

/* bit flags accepted in the 'flags' argument of spamsum() */
/* skip characters for which isspace() is true */
#define FLAG_IGNORE_WHITESPACE 1
/* skip everything up to and including the first blank line */
#define FLAG_IGNORE_HEADERS 2

/*
  state of the rolling hash. There is a single file-scope instance, so
  every user of roll_hash()/roll_reset() shares (and overwrites) it.
*/
static struct {
	/* the last ROLLING_WINDOW bytes seen, stored circularly and
	   indexed by n % ROLLING_WINDOW */
	uchar window[ROLLING_WINDOW];
	/* h1: sum of the bytes in the window
	   h2: position-weighted sum of the bytes in the window
	   h3: shift/xor hash of the bytes seen */
	u32 h1, h2, h3;
	/* number of bytes fed in since the last roll_reset() */
	u32 n;
} roll_state;

/*
  feed one byte into the rolling hash and return the updated hash value.

  The hash is a combination of three components kept in roll_state:

    h1 - plain sum of the bytes currently in the window
    h2 - sum of the bytes in the window weighted by their position
    h3 - shift/xor hash; each step shifts the previous value left by
         5 bits, so old bytes fall out of the low 32 bits after a few
         steps. It is mostly there to cope with large blocksize values

  Because only the last ROLLING_WINDOW bytes influence h1 and h2, the
  hash resynchronises by itself after inserts/deletes in the input.
*/
static inline u32 roll_hash(uchar c)
{
	/* slot of the circular window that holds the oldest byte */
	const u32 slot = roll_state.n % ROLLING_WINDOW;
	const uchar evicted = roll_state.window[slot];

	/* h2 must be updated first: it needs h1 as it was before this byte */
	roll_state.h2 = roll_state.h2 - roll_state.h1 + ROLLING_WINDOW * c;
	roll_state.h1 = roll_state.h1 + c - evicted;

	/* the new byte takes the place of the oldest one */
	roll_state.window[slot] = c;
	roll_state.n++;

	roll_state.h3 = ((roll_state.h3 << 5) & 0xFFFFFFFF) ^ c;

	return roll_state.h1 + roll_state.h2 + roll_state.h3;
}

/*
  put the rolling hash back into its initial (all zero) state.

  returns the initial rolling hash value, which is always 0
*/
static u32 roll_reset(void)
{
	memset(roll_state.window, 0, sizeof(roll_state.window));
	roll_state.h1 = 0;
	roll_state.h2 = 0;
	roll_state.h3 = 0;
	roll_state.n = 0;

	return 0;
}

/*
  a simple non-rolling hash, based on the FNV hash: multiply the
  running value 'h' by HASH_PRIME, then xor in the byte 'c'.

  returns the new running value
*/
static inline u32 sum_hash(uchar c, u32 h)
{
	return (h * HASH_PRIME) ^ c;
}

/*
  take a message of length 'length' and return a string representing a
  hash of that message, prefixed by the selected blocksize.

  in      - the message bytes. The buffer does not need to be NUL
            terminated; only the first 'length' bytes are read
  length  - number of bytes in 'in'
  flags   - combination of FLAG_IGNORE_WHITESPACE / FLAG_IGNORE_HEADERS
  bsize   - block size to use, or 0 to let the function choose one

  The result has the form "<blocksize>:<sig1>:<sig2>" where sig1 is
  computed with the block size and sig2 with twice the block size.

  returns a malloc()ed string that the caller must free(), or NULL if
  the allocation fails. The function uses the file-scope rolling hash
  state, so it must not be called concurrently.
*/
char *spamsum(const uchar *in, size_t length, u32 flags, u32 bsize)
{
	const char *b64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
	char *ret, *p;
	size_t total_chars;
	size_t i;
	u32 h, h2, h3;
	u32 j, k;
	u32 block_size;
	char ret2[SPAMSUM_LENGTH/2 + 1];

	/* if we are ignoring email headers then skip past them now. The
	   search is bounded by 'length' because the input is raw data
	   (e.g. an mmap()ed file) and is not guaranteed to carry a NUL
	   terminator */
	if (flags & FLAG_IGNORE_HEADERS) {
		for (i = 0; i + 1 < length; i++) {
			if (in[i] == '\n' && in[i+1] == '\n') {
				in += i + 2;
				length -= i + 2;
				break;
			}
		}
	}

	if (flags & FLAG_IGNORE_WHITESPACE) {
		/* count the non-ignored chars */
		total_chars = 0;
		for (i = 0; i < length; i++) {
			if (!isspace(in[i])) total_chars++;
		}
	} else {
		total_chars = length;
	}

	if (bsize == 0) {
		/* guess a reasonable block size. The product is computed in
		   a wide type so that it cannot wrap, and the doubling stops
		   while block_size*2 (used for the second signature) still
		   fits in a u32 */
		block_size = MIN_BLOCKSIZE;
		while (block_size <= ((u32)-1) / 4 &&
		       (unsigned long long)block_size * SPAMSUM_LENGTH < total_chars) {
			block_size = block_size * 2;
		}
	} else {
		block_size = bsize;
	}

	/* room for the block size prefix, both signatures, the two ':'
	   separators and the terminator */
	ret = malloc(SPAMSUM_LENGTH + SPAMSUM_LENGTH/2 + 20);
	if (!ret) return NULL;

again:
	/* the first part of the spamsum signature is the blocksize */
	snprintf(ret, 12, "%u:", block_size);
	p = ret + strlen(ret);

	memset(p, 0, SPAMSUM_LENGTH+1);
	memset(ret2, 0, sizeof(ret2));

	k = j = 0;
	h3 = h2 = HASH_INIT;
	h = roll_reset();

	for (i = 0; i < length; i++) {
		if ((flags & FLAG_IGNORE_WHITESPACE) &&
		    isspace(in[i])) continue;

		/*
		   at each character we update the rolling hash and
		   the normal hash. When the rolling hash hits the
		   reset value then we emit the normal hash as an
		   element of the signature and reset both hashes
		*/
		h = roll_hash(in[i]);
		h2 = sum_hash(in[i], h2);
		h3 = sum_hash(in[i], h3);

		if (h % block_size == (block_size-1)) {
			/* we have hit a reset point. We now emit a
			   hash which is based on all characters in the
			   piece of the message between the last reset
			   point and this one */
			p[j] = b64[h2 % 64];
			if (j < SPAMSUM_LENGTH-1) {
				/* we can have a problem with the tail
				   overflowing. The easiest way to
				   cope with this is to only reset the
				   second hash if we have room for
				   more characters in our
				   signature. This has the effect of
				   combining the last few pieces of
				   the message into a single piece */
				h2 = HASH_INIT;
				j++;
			}
		}

		/* this produces a second signature with a block size
		   of block_size*2. By producing dual signatures in
		   this way the effect of small changes in the message
		   size near a block size boundary is greatly reduced. */
		if (h % (block_size*2) == ((block_size*2)-1)) {
			ret2[k] = b64[h3 % 64];
			if (k < SPAMSUM_LENGTH/2-1) {
				h3 = HASH_INIT;
				k++;
			}
		}
	}

	/* if we have anything left then add it to the end. This
	   ensures that the last part of the message is always
	   considered */
	if (h != 0) {
		p[j] = b64[h2 % 64];
		ret2[k] = b64[h3 % 64];
	}

	/* p was zero filled above, so p+j is a valid (possibly empty)
	   string that the separator and second signature append to */
	strcat(p+j, ":");
	strcat(p+j, ret2);

	/* our blocksize guess may have been way off - repeat if necessary */
	if (bsize == 0 && block_size > MIN_BLOCKSIZE && j < SPAMSUM_LENGTH/2) {
		block_size = block_size / 2;
		goto again;
	}

	return ret;
}


/*
   we only accept a match if we have at least one common substring in
   the signature of length ROLLING_WINDOW. This dramatically drops the
   false positive rate for low score thresholds while having
   negligible effect on the rate of spam detection.

   s1 must not be longer than SPAMSUM_LENGTH characters: one rolling
   hash per character of s1 is stored in a fixed size array.

   return 1 if the two strings do have a common substring, 0 otherwise
*/
static int has_common_substring(const char *s1, const char *s2)
{
	u32 hashes[SPAMSUM_LENGTH];
	int num_hashes = 0;
	int pos, k;

	/* there are many possible algorithms for common substring
	   detection. Here the rolling hash code is re-used as a cheap
	   filter for possible substring matches */

	memset(hashes, 0, sizeof(hashes));

	/* pass 1: remember the windowed rolling hash at every offset
	   of the first string */
	roll_reset();
	while (s1[num_hashes]) {
		hashes[num_hashes] = roll_hash((uchar)s1[num_hashes]);
		num_hashes++;
	}

	/* pass 2: roll over the second string and look each hash up in
	   the table from pass 1. An equal hash is only a candidate; it
	   is confirmed with a direct string comparison */
	roll_reset();
	for (pos = 0; s2[pos]; pos++) {
		const u32 h = roll_hash((uchar)s2[pos]);
		const char *win2;

		/* the window is not full yet */
		if (pos < ROLLING_WINDOW-1) continue;

		/* start of the window that ends at s2[pos] */
		win2 = s2 + pos - (ROLLING_WINDOW-1);

		for (k = ROLLING_WINDOW-1; k < num_hashes; k++) {
			if (hashes[k] == 0 || hashes[k] != h) continue;

			/* we have a potential match - confirm it */
			if (strlen(win2) < ROLLING_WINDOW) continue;
			if (strncmp(win2, s1 + k - (ROLLING_WINDOW-1),
				    ROLLING_WINDOW) == 0) {
				return 1;
			}
		}
	}

	return 0;
}


/*
  eliminate sequences of longer than 3 identical characters. These
  sequences contain very little information so they tend to just bias
  the result unfairly.

  A character is dropped when it equals all three characters that
  precede it in 'str', so every run is cut down to 3 characters.

  returns a newly malloc()ed string that the caller must free(), or
  NULL if the allocation fails
*/
static char *eliminate_sequences(const char *str)
{
	size_t len = strlen(str);
	size_t i, j;
	char *ret;

	ret = malloc(len + 1);
	if (!ret) return NULL;

	/* the first three characters can never be the 4th member of a
	   run, so they are always kept. Strings shorter than that are
	   copied as they are - the output index must not start beyond
	   the end of the allocation */
	j = len < 3 ? len : 3;
	memcpy(ret, str, j);

	for (i = 3; i < len; i++) {
		if (str[i] != str[i-1] ||
		    str[i] != str[i-2] ||
		    str[i] != str[i-3]) {
			ret[j++] = str[i];
		}
	}

	ret[j] = 0;

	return ret;
}

/*
  this is the low level string scoring algorithm. It takes two strings
  and scores them on a scale of 0-100 where 0 is a terrible match and
  100 is a great match.

  A score of 0 is also returned when either string is longer than
  SPAMSUM_LENGTH, or when the strings share no common substring of
  length ROLLING_WINDOW.

  block_size is currently unused: the clamp that limited the score for
  small block sizes has been disabled (see the note at the end).
*/
static unsigned score_strings(const char *s1, const char *s2, u32 block_size)
{
	int edit_distn(const char *from, int from_len, const char *to, int to_len);
	const u32 len1 = strlen(s1);
	const u32 len2 = strlen(s2);
	u32 distance, scaled;

	/* not a real spamsum signature? */
	if (len1 > SPAMSUM_LENGTH || len2 > SPAMSUM_LENGTH) return 0;

	/* the two strings must have a common substring of length
	   ROLLING_WINDOW to be candidates */
	if (!has_common_substring(s1, s2)) return 0;

	/* the edit distance gives us a pretty good idea of how closely
	   related the two strings are */
	distance = edit_distn(s1, len1, s2, len2);

	/* scale the edit distance by the lengths of the two strings, so
	   that it measures the proportion of the message that has
	   changed rather than an absolute quantity. The result is
	   roughly on a 0-64 scale, 0 being a good match and 64 a
	   complete mismatch */
	scaled = (distance * SPAMSUM_LENGTH) / (len1 + len2);

	/* rescale to a 0-100 scale (friendlier to humans) */
	scaled = (100 * scaled) / 64;

	/* it is possible to get a value above 100 here, but it is a
	   really terrible match */
	if (scaled >= 100) return 0;

	/* flip the scale so that 0 is a poor match and 100 an excellent
	   one. The score is deliberately not limited to
	   block_size/MIN_BLOCKSIZE * MIN(len1, len2) for small block
	   sizes; that clamp was removed so that small block sizes can
	   be compared */
	return 100 - scaled;
}

/*
  given two spamsum strings return a value indicating the degree to
  which they match.

  str1, str2 - signatures of the form "<blocksize>:<sig1>:<sig2>"

  returns a score from 0 (no match) to 100 (excellent match). 0 is
  also returned when a signature is malformed, when the block sizes of
  the two signatures are neither equal nor a factor of two apart, or
  when memory cannot be allocated - so a failure can never be mistaken
  for a high score.
*/
u32 spamsum_match(const char *str1, const char *str2)
{
	u32 block_size1, block_size2;
	u32 score = 0;
	char *s1, *s2;
	char *s1_1, *s1_2;
	char *s2_1, *s2_2;

	/* each spamsum is prefixed by its block size */
	if (sscanf(str1, "%u:", &block_size1) != 1 ||
	    sscanf(str2, "%u:", &block_size2) != 1) {
		return 0;
	}

	/* if the blocksizes don't match then we are comparing
	   apples to oranges ... */
	if (block_size1 != block_size2 &&
	    block_size1 != block_size2*2 &&
	    block_size2 != block_size1*2) {
		return 0;
	}

	/* move past the prefix */
	str1 = strchr(str1, ':');
	str2 = strchr(str2, ':');

	if (!str1 || !str2) {
		/* badly formed ... */
		return 0;
	}

	/* there is very little information content in sequences of
	   the same character like 'LLLLL'. Eliminate any sequences
	   longer than 3. This is especially important when combined
	   with the has_common_substring() test done while scoring. */
	s1 = eliminate_sequences(str1+1);
	s2 = eliminate_sequences(str2+1);

	if (!s1 || !s2) {
		/* out of memory: release whichever copy was made and
		   report "no match" */
		free(s1);
		free(s2);
		return 0;
	}

	/* now break them into the two pieces */
	s1_1 = s1;
	s2_1 = s2;

	s1_2 = strchr(s1, ':');
	s2_2 = strchr(s2, ':');

	if (!s1_2 || !s2_2) {
		/* a signature is malformed - it doesn't have 2 parts */
		free(s1);
		free(s2);
		return 0;
	}

	/* terminate the first piece and step onto the second one */
	*s1_2++ = 0;
	*s2_2++ = 0;

	/* each signature has a string for two block sizes. We now
	   choose how to combine the two block sizes. We checked above
	   that they have at least one block size in common */
	if (block_size1 == block_size2) {
		u32 score1, score2;
		score1 = score_strings(s1_1, s2_1, block_size1);
		score2 = score_strings(s1_2, s2_2, block_size2);
		score = MAX(score1, score2);
	} else if (block_size1 == block_size2*2) {
		/* the first piece of str1 and the second piece of str2
		   were computed with the same block size */
		score = score_strings(s1_1, s2_2, block_size1);
	} else {
		score = score_strings(s1_2, s2_1, block_size2);
	}

	free(s1);
	free(s2);

	return score;
}

/*
  return the maximum match for a file containing a list of spamsums.

  fname     - text file with one signature per line
  sum       - the signature to look for
  threshold - scanning stops as soon as a score of at least this
              value has been found

  returns the best score found, or 0 if the file cannot be opened
*/
u32 spamsum_match_db(const char *fname, const char *sum, u32 threshold)
{
	FILE *f;
	/* a signature can be over 100 characters long (block size
	   prefix, SPAMSUM_LENGTH + SPAMSUM_LENGTH/2 hash characters and
	   two separators), so the buffer must be comfortably larger
	   than that or long lines would be split in two */
	char line[256];
	u32 best = 0;

	f = fopen(fname, "r");
	if (!f) return 0;

	/* on each line of the database we compute the spamsum match
	   score. We then pick the best score */
	while (fgets(line, sizeof(line), f)) {
		u32 score;

		/* cut the line at the newline, if there is one. Unlike
		   indexing with strlen()-1 this is safe for an empty
		   string */
		line[strcspn(line, "\n")] = 0;

		score = spamsum_match(sum, line);

		if (score > best) {
			best = score;
			if (best >= threshold) break;
		}
	}

	fclose(f);

	return best;
}

/*
  return the spamsum on stdin.

  All of standard input is read into memory and handed to spamsum()
  with the given flags and block size.

  returns the malloc()ed signature from spamsum(), or NULL if memory
  runs out. A read error other than EINTR ends the input like an end
  of file does.
*/
static char *spamsum_stdin(u32 flags, u32 block_size)
{
	uchar buf[10*1024];
	uchar *msg;
	size_t length = 0;
	ssize_t n;
	char *sum;

	msg = malloc(sizeof(buf));
	if (!msg) return NULL;

	/* load the file, expanding the allocation as needed. */
	while (1) {
		uchar *grown;

		n = read(0, buf, sizeof(buf));
		if (n == -1 && errno == EINTR) continue;
		if (n <= 0) break;

		/* keep the old pointer until realloc() has succeeded,
		   otherwise the data read so far would be leaked */
		grown = realloc(msg, length + (size_t)n);
		if (!grown) {
			free(msg);
			return NULL;
		}
		msg = grown;

		memcpy(msg+length, buf, (size_t)n);
		length += (size_t)n;
	}

	sum = spamsum(msg, length, flags, block_size);

	free(msg);

	return sum;
}


/*
  return the spamsum on a file.

  fname      - path of the file, or "-" to read standard input
  flags      - passed on to spamsum()
  block_size - passed on to spamsum() (0 lets it choose)

  returns the malloc()ed signature, which the caller must free(), or
  NULL on failure. When the file cannot be opened, examined or mapped
  a message is printed with perror().

  NOTE(review): the file is mapped with its full st_size; mmap() of a
  zero length file is expected to fail, so an empty file presumably
  takes the "mmap" error path - confirm whether that is intended.
*/
char *spamsum_file(const char *fname, u32 flags, u32 block_size)
{
	int fd;
	char *sum;
	struct stat st;
	uchar *msg;

	if (strcmp(fname, "-") == 0) {
		return spamsum_stdin(flags, block_size);
	}

	fd = open(fname, O_RDONLY);
	if (fd == -1) {
		perror(fname);
		return NULL;
	}

	/* on the error paths below perror() is called before close()
	   so that errno still belongs to the call that failed */
	if (fstat(fd, &st) == -1) {
		perror("fstat");
		close(fd);
		return NULL;
	}

	msg = mmap(NULL, st.st_size, PROT_READ, MAP_FILE|MAP_PRIVATE, fd, 0);
	if (msg == MAP_FAILED) {
		perror("mmap");
		close(fd);
		return NULL;
	}
	/* the mapping stays valid after the descriptor is closed */
	close(fd);

	sum = spamsum(msg, st.st_size, flags, block_size);

	munmap(msg, st.st_size);

	return sum;
}

/*
  print the usage message on standard output
*/
static void show_help(void)
{
	static const char help_text[] =
		"\n"
		"spamsum v1.1 written by Andrew Tridgell <tridge@samba.org>\n"
		"\n"
		"spamsum computes a signature string that is particular good for detecting if two emails\n"
		"are very similar. This can be used to detect SPAM.\n"
		"\n"
		"Syntax:\n"
		"   spamsum [options] <files>\n"
		"or\n"
		"   spamsum [options] -d sigs.txt -c SIG\n"
		"or\n"
		"   spamsum [options] -d sigs.txt -C file\n"
		"\n"
		"When called with a list of filenames spamsum will write out the\n"
		"signatures of each file on a separate line. You can specify the\n"
		"filename '-' for standard input.\n"
		"\n"
		"When called with the second form, spamsum will print the best score\n"
		"for the given signature with the signatures in the given database. A\n"
		"score of 100 means a perfect match, and a score of 0 means a complete\n"
		"mismatch.\n"
		"\n"
		"When checking, spamsum returns 0 (success) when the message *is* spam,\n"
		"1 for internal errors, and 2 for messages whose signature is not\n"
		"found.\n"
		"\n"
		"The 3rd form is just like the second form, but you pass a file\n"
		"containing a message instead of a pre-computed signature.\n"
		"\n"
		"Options:\n"
		"   -W              ignore whitespace\n"
		"   -H              skip past mail headers\n"
		"   -B <bsize>      force a block size of bsize\n"
		"   -T <threshold>  set the threshold above which spamsum will stop\n"
		"                   looking (default 90)\n";

	/* the text contains no conversion specifications, so it can be
	   written out as it is */
	fputs(help_text, stdout);
}

/*
  command line front end.

  Options are handled in the order they appear, so -d, -W, -H, -B and
  -T only affect a -c / -C check when they are given before it.

  exit status:
    with -c / -C : 0 if the best score reaches the threshold, 2 if it
                   does not, 1 if no database was given or the
                   signature of the file could not be computed
    otherwise    : 0 on success, 1 if the signature of at least one
                   file could not be computed
*/
int main(int argc, char *argv[])
{
	char *sum;
	extern char *optarg;
	extern int optind;
	int c;
	char *dbname = NULL;
	u32 score;
	int i;
	int status = 0;
	u32 flags = 0;
	u32 block_size = 0;
	u32 threshold = 90;

	while ((c = getopt(argc, argv, "B:WHd:c:C:hT:")) != -1) {
		switch (c) {
		case 'W':
			flags |= FLAG_IGNORE_WHITESPACE;
			break;

		case 'H':
			flags |= FLAG_IGNORE_HEADERS;
			break;

		case 'd':
			dbname = optarg;
			break;

		case 'B':
			block_size = atoi(optarg);
			break;

		case 'T':
			threshold = atoi(optarg);
			break;

		case 'c':
			/* check a pre-computed signature against the database */
			if (!dbname) {
				show_help();
				exit(1);
			}
			score = spamsum_match_db(dbname, optarg,
						 threshold);
			printf("%u\n", score);
			exit(score >= threshold ? 0 : 2);

		case 'C':
			/* check the signature of a file against the database */
			if (!dbname) {
				show_help();
				exit(1);
			}
			sum = spamsum_file(optarg, flags, block_size);
			if (!sum) {
				/* no signature, nothing to compare: this is
				   an internal error, not a "not found" */
				exit(1);
			}
			score = spamsum_match_db(dbname, sum, threshold);
			free(sum);
			printf("%u\n", score);
			exit(score >= threshold ? 0 : 2);

		case 'h':
		default:
			show_help();
			exit(0);
		}
	}

	argc -= optind;
	argv += optind;

	if (argc == 0) {
		show_help();
		return 0;
	}

	/* compute the spamsum on a list of files */
	for (i = 0; i < argc; i++) {
		sum = spamsum_file(argv[i], flags, block_size);
		if (!sum) {
			/* a NULL pointer must not be handed to printf("%s");
			   remember the failure and carry on with the
			   remaining files */
			status = 1;
			continue;
		}
		printf("%s\n", sum);
		free(sum);
	}

	return status;
}