diff --git a/.gitignore b/.gitignore index ba2bea8..c25bfba 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ dist build _build distribute-* +.ruff_cache/ .tox/ .vscode/ -venv/ \ No newline at end of file +venv/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..58b7ab2 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,29 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + - id: check-toml + - id: check-yaml + - id: check-case-conflict + - id: check-docstring-first + - id: end-of-file-fixer + - id: trailing-whitespace + # Docformatter 1.7.5 isn't compatible with Pre-commit 4.0 + # - repo: https://github.com/PyCQA/docformatter + # rev: v1.7.5 + # hooks: + # - id: docformatter + # args: [--in-place, --black] + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.9.6 + hooks: + # Run the linter. + - id: ruff + # Run the formatter. + - id: ruff-format + - repo: https://github.com/codespell-project/codespell + rev: v2.4.1 + hooks: + - id: codespell + # remove toml extra once Python 3.10 is no longer supported + additional_dependencies: ['.[toml]'] diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 7c0c9f8..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,3 +0,0 @@ -include LICENSE -include tox.ini -recursive-include tests *.py diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..a11398e --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,86 @@ +[build-system] +requires = [ + "setuptools==75.8.0", + "setuptools_scm==8.1.0", +] +build-backend = "setuptools.build_meta" + +[project] +dynamic = ["version"] +name = "pyspamsum" +description = "A Python wrapper for Andrew Tridgell's spamsum algorithm" +readme = "README.rst" +requires-python = ">= 3.9" +authors = [ + {name="Russell Keith-Magee", email="russell@keith-magee.com"} +] +maintainers = [ + {name="Russell Keith-Magee", email="russell@keith-magee.com"} +] +keywords = [ + 
"spamsum", +] +license.text = "New BSD" +classifiers=[ + "Development Status :: 5 - Production/Stable", + "License :: OSI Approved :: BSD License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", + "Topic :: Text Processing", + "Topic :: Utilities", +] + +[project.optional-dependencies] +# Extras used by developers *of* pyspamsum are pinned to specific versions to +# ensure environment consistency. +dev = [ + "pre-commit == 4.1.0", + "pytest == 8.3.4", + "ruff == 0.9.6", + "setuptools_scm == 8.1.0", + "tox == 4.24.1", +] + +[project.urls] +Homepage = "https://github.com/freakboy3742/pyspamsum/" +Tracker = "https://github.com/freakboy3742/pyspamsum/issues" +Source = "https://github.com/freakboy3742/pyspamsum/" + +[tool.pytest.ini_options] +testpaths = ["tests"] +filterwarnings = [ + "error", +] + +[tool.ruff.lint] +select = [ + # pycodestyle + "E", + # Pyflakes + "F", + # pyupgrade + "UP", + # flake8-bugbear + "B", + # flake8-simplify + "SIM", + # isort + "I", +] + +[tool.ruff.lint.isort] +known-first-party = ["spamsum"] + +[tool.setuptools] +ext-modules = [ + {name="spamsum", sources=["src/pyspamsum.c", "src/spamsum.c", "src/edit_dist.c"]}, +] + +[tool.setuptools_scm] +# To enable SCM versioning, we need an empty tool configuration for setuptools_scm diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index b0ca624..0000000 --- a/setup.cfg +++ /dev/null @@ -1,11 +0,0 @@ - -[flake8] -# https://flake8.readthedocs.org/en/latest/ -exclude=\ - */.eggs/*,\ - */build/*,\ - .tox/*,\ - local/*,\ - venv* -max-complexity = 25 -max-line-length = 119 \ No newline at end of file diff --git a/setup.py b/setup.py deleted file mode 100644 index 1070abf..0000000 --- a/setup.py +++ 
/dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env python -import io - -from setuptools import setup, Extension - - -with io.open('README.rst', encoding='utf8') as readme: - long_description = readme.read() - - -setup( - name="pyspamsum", - version="1.0.5", - description="A Python wrapper for Andrew Tridgell's spamsum algorithm", - long_description=long_description, - long_description_content_type='text/x-rst', - author="Russell Keith-Magee", - author_email="russell@keith-magee.com", - url='http://github.com/freakboy3742/pyspamsum/', - license="New BSD", - classifiers=[ - 'Development Status :: 5 - Production/Stable', - 'License :: OSI Approved :: BSD License', - 'Operating System :: OS Independent', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - 'Topic :: Text Processing', - 'Topic :: Utilities', - ], - ext_modules=[ - Extension( - "spamsum", [ - "pyspamsum.c", - "spamsum.c", - "edit_dist.c", - ] - ) - ], - test_suite='tests', -) diff --git a/edit_dist.c b/src/edit_dist.c similarity index 97% rename from edit_dist.c rename to src/edit_dist.c index cec2f4a..e511fc6 100644 --- a/edit_dist.c +++ b/src/edit_dist.c @@ -159,7 +159,7 @@ register int from_len, to_len; infinity)) Since this only looks at most two rows and three columns back, we need - only store the values for the two preceeding rows. In this + only store the values for the two preceding rows. In this implementation, we do not explicitly store the zero column, so only 2 * from_len + 2 words are needed. However, in the implementation of the swap_cost check, the current matrix value is used as a buffer; we @@ -192,8 +192,8 @@ register int from_len, to_len; strings are nonempty. We also don't need to consider swap costs in row 1. - COMMENT: the indicies row and col below point into the STRING, so - the corresponding MATRIX indicies are row+1 and col+1. 
+ COMMENT: the indices row and col below point into the STRING, so + the corresponding MATRIX indices are row+1 and col+1. */ buffer[index++] = min2(ins + del, (from[0] == to[0] ? 0 : ch)); @@ -266,4 +266,3 @@ register int from_len, to_len; free((char *) buffer); return row; } /* edit_distn */ - diff --git a/pyspamsum.c b/src/pyspamsum.c similarity index 99% rename from pyspamsum.c rename to src/pyspamsum.c index 4eb5b0b..66e3d0b 100644 --- a/pyspamsum.c +++ b/src/pyspamsum.c @@ -156,4 +156,3 @@ initspamsum(void) return module; #endif } - diff --git a/spamsum.c b/src/spamsum.c similarity index 98% rename from spamsum.c rename to src/spamsum.c index 6ea2ff7..c12eb06 100644 --- a/spamsum.c +++ b/src/spamsum.c @@ -3,7 +3,7 @@ Copyright Andrew Tridgell 2002 This code is released under the GNU General Public License version 2 - or later. Alteratively, you may also use this code under the terms + or later. Alternatively, you may also use this code under the terms of the Perl Artistic license. If you wish to distribute this code under the terms of a different @@ -231,7 +231,7 @@ again: we only accept a match if we have at least one common substring in the signature of length ROLLING_WINDOW. This dramatically drops the false positive rate for low score thresholds while having - negligable affect on the rate of spam detection. + negligible effect on the rate of spam detection. return 1 if the two strings do have a common substring, 0 otherwise */ @@ -242,7 +242,7 @@ static int has_common_substring(const char *s1, const char *s2) u32 hashes[SPAMSUM_LENGTH]; /* there are many possible algorithms for common substring - detection. In this case I am re-using the rolling hash code + detection. 
In this case I am reusing the rolling hash code to act as a filter for possible substring matches */ roll_reset(); @@ -676,4 +676,4 @@ int main(int argc, char *argv[]) } return 0; -} \ No newline at end of file +} diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index c03dafd..0000000 --- a/tests/__init__.py +++ /dev/null @@ -1,52 +0,0 @@ -import unittest - -import spamsum - - -class SpamSumTest(unittest.TestCase): - def setUp(self): - self.s1 = "I am the very model of a modern Major-General, I've information animal and vegetable and mineral" - self.s2 = "I am the very model of a modern Brigadier, I've information animal and vegetable and something else" - self.s3 = "Huh? Gilbert and Who?" - - def test_edit_distance(self): - self.assertEqual(spamsum.edit_distance(self.s1, self.s2), 27) - self.assertEqual(spamsum.edit_distance(self.s2, self.s1), 27) - self.assertEqual(spamsum.edit_distance(self.s1, self.s3), 93) - self.assertEqual(spamsum.edit_distance(self.s2, self.s3), 96) - - def test_spamsum(self): - self.assertEqual( - spamsum.spamsum(self.s1), - '3:kEvyc/sFIKwYclQY4MKLFE4Igu0uLzIKygn:kE6Ai3KQ/MKOgDKZn' - ) - self.assertEqual( - spamsum.spamsum(self.s2), - '3:kEvyc/sFIKwpErXLsCTApY4MKLFE4Igu0uLzWKIAYjtn:kE6Ai3jjTU/MKOgdK9Yjt' - ) - self.assertEqual( - spamsum.spamsum(self.s3), - '3:uZ3B:uZx' - ) - - def test_match(self): - self.assertEqual( - spamsum.match(spamsum.spamsum(self.s1), spamsum.spamsum(self.s1)), - 100 - ) - self.assertEqual( - spamsum.match(spamsum.spamsum(self.s1), spamsum.spamsum(self.s2)), - 72 - ) - self.assertEqual( - spamsum.match(spamsum.spamsum(self.s2), spamsum.spamsum(self.s1)), - 72 - ) - self.assertEqual( - spamsum.match(spamsum.spamsum(self.s1), spamsum.spamsum(self.s3)), - 0 - ) - self.assertEqual( - spamsum.match(spamsum.spamsum(self.s2), spamsum.spamsum(self.s3)), - 0 - ) diff --git a/tests/test_spamsum.py b/tests/test_spamsum.py new file mode 100644 index 0000000..f094257 --- /dev/null +++ 
b/tests/test_spamsum.py @@ -0,0 +1,61 @@ +import pytest + +import spamsum + +GILBERT = ( + "I am the very model of a modern Major-General, " + "I've information animal and vegetable and mineral" +) +NOT_GILBERT = ( + "I am the very model of a modern Brigadier, " + "I've information animal and vegetable and something else" +) +IGNORANCE = "Huh? Gilbert and Who?" + + +@pytest.mark.parametrize( + "s1, s2, distance", + [ + (GILBERT, NOT_GILBERT, 27), + (NOT_GILBERT, GILBERT, 27), + (GILBERT, IGNORANCE, 93), + (NOT_GILBERT, IGNORANCE, 96), + ], +) +def test_edit_distance(s1, s2, distance): + assert spamsum.edit_distance(s1, s2) == distance + + +@pytest.mark.parametrize( + "value, expected", + [ + ( + GILBERT, + "3:kEvyc/sFIKwYclQY4MKLFE4Igu0uLzIKygn:kE6Ai3KQ/MKOgDKZn", + ), + ( + NOT_GILBERT, + "3:kEvyc/sFIKwpErXLsCTApY4MKLFE4Igu0uLzWKIAYjtn:kE6Ai3jjTU/MKOgdK9Yjt", + ), + ( + IGNORANCE, + "3:uZ3B:uZx", + ), + ], +) +def test_spamsum(value, expected): + assert spamsum.spamsum(value) == expected + + +@pytest.mark.parametrize( + "s1, s2, match", + [ + (GILBERT, GILBERT, 100), + (GILBERT, NOT_GILBERT, 72), + (NOT_GILBERT, GILBERT, 72), + (GILBERT, IGNORANCE, 0), + (NOT_GILBERT, IGNORANCE, 0), + ], +) +def test_match(s1, s2, match): + assert spamsum.match(spamsum.spamsum(s1), spamsum.spamsum(s2)) == match diff --git a/tox.ini b/tox.ini index 64a2fd4..f0a773c 100644 --- a/tox.ini +++ b/tox.ini @@ -1,25 +1,15 @@ - [tox] -envlist = flake8,package-py{36,37,38,39},py{36,37,38,39} +envlist = pre-commit,py{39,310,311,312,313,314} skip_missing_interpreters = true -[testenv] -commands = - python setup.py test +[testenv:pre-commit] +package = wheel +wheel_build_env = .pkg +extras = dev +commands = pre-commit run --all-files --show-diff-on-failure --color=always -[testenv:flake8] -skip_install = True -deps = - flake8 -commands = flake8 {posargs} - -[testenv:package-py{36,37,38,39}] -skip_install = True -deps = - check_manifest - wheel - twine +[testenv:py{,39,310,311,312,313,314}] 
+depends = pre-commit +extras = dev commands = - check-manifest -v - python setup.py sdist bdist_wheel - python -m twine check dist/* + python -m pytest {posargs:-vv --color yes}