anagram/run.cc
Jeff Epler ffe196d6aa Fix "must contain" with too many of a letter
If the "must contain" (rs) set had too many uses of a character,
the initial letter counts (ww) could be nonsense.

Now, if you do a bogus search like `munmunii -- iii` (commandline)
or `munmun =iii` (web), you'll get 0 results. (it'll somewhat
confusingly report that it reached a max of 0 searches, but so
be it)

Closes #1
2025-05-01 09:47:43 +02:00

856 lines
23 KiB
C++
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* Copyright © 2013 Jeff Epler <jepler@unpythonic.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#ifdef ANA_AS_PYMODULE
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#endif
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <cstring>
#include <endian.h>
#include <fstream>
#include <functional>
#include <inttypes.h>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <stdint.h>
#include <sys/time.h>
#include <string>
#include <unistd.h>
#include <vector>
using namespace std;
/*
 * Letter statistics for one word, followed in memory by the word's text.
 *
 * `w` is declared with a single element, but objects built from a word are
 * placed in storage of sizeof(worddata) + strlen(word) bytes (see make_word
 * and dict::addword), so the text extends past the declared end of the
 * struct.  Copying a worddata by value therefore keeps the counts (m, c, l)
 * but only the first character of the text.
 */
struct worddata
{
    /* Empty summary: no letters counted, empty text. */
    worddata()
    {
        m = 0;
        l = 0;
        memset(&c, 0, sizeof(c));
        w[0] = 0;
    }
    /* True when at least one letter bit is set in the mask. */
    operator bool() const { return m; }
    uint32_t m;          // bit i is set when letter 'a'+i occurs
    unsigned char c[26]; // number of occurrences of each letter 'a'..'z'
    uint16_t l;          // total number of letters counted
    char w[1]; // note: not declared as flexible member for Reasons
private:
    /* Note: Constructor assumes that storage for word has been
       allocated!  Doesn't work with stacked or regular new'd storage.
     */
    worddata(const char *word)
    {
        m = 0;
        l = 0;
        strcpy(w, word);
        memset(&c, 0, sizeof(c));
        for(const char *s = word; *s; s++) {
            // <cctype> functions require a value representable as
            // unsigned char; a plain (possibly negative) char is undefined.
            const unsigned char ch = static_cast<unsigned char>(*s);
            if(!isalpha(ch)) continue;
            const int o = tolower(ch) - 'a';
            // In a non-"C" locale isalpha() may accept letters outside
            // a-z; those must not index past the 26-entry table.
            if(o < 0 || o >= 26) continue;
            l++;
            c[o]++;
            m |= (uint32_t(1) << o);
        }
    }
    /* Allocate storage large enough for the text and build a worddata in it. */
    static worddata *make_word(const char *word) {
        size_t sz = sizeof(worddata) + strlen(word);
        char *storage = new char[sz];
        worddata *r = reinterpret_cast<worddata *>(storage);
        new(r) worddata(word);
        return r;
    }
    /* Release storage obtained from make_word. */
    static void delete_word(worddata *word) {
        delete[] reinterpret_cast<char*>(word);
    }
    friend struct wordholder;
    friend struct dict;
};
/*
 * Owns one heap-allocated worddata built with worddata::make_word and
 * releases it with worddata::delete_word on destruction.
 * Copy-constructible (re-builds from the held text); not assignable.
 */
struct wordholder
{
    // Holder for the empty word.
    wordholder()
        : w(worddata::make_word(""))
    {}

    // Holder for a NUL-terminated word.
    wordholder(const char *s)
        : w(worddata::make_word(s))
    {}

    // Holder for the contents of a string.
    wordholder(const string &s)
        : w(worddata::make_word(s.c_str()))
    {}

    // Deep copy: a fresh worddata is built from the other holder's text.
    wordholder(const wordholder &h)
        : w(worddata::make_word(h.w->w))
    {}

    ~wordholder()
    {
        worddata::delete_word(w);
    }

    // Access to the held summary.
    worddata &value() { return *w; }
    const worddata &value() const { return *w; }

private:
    worddata *w;
    wordholder &operator=(const wordholder &o) = delete;
};
inline size_t lcnt(const struct worddata &w)
{
return w.l;
}
/*
 * True when `b` can be formed from the letters of `a`: every letter of `b`
 * occurs in `a` at least as many times.
 */
inline bool candidate(const worddata &a, const worddata &b)
{
    // Cheap reject: b uses some letter that a does not have at all.
    if((b.m & ~a.m) != 0) return false;

    // Otherwise compare the per-letter counts.
    int idx = 0;
    while(idx < 26 && a.c[idx] >= b.c[idx]) idx++;
    return idx == 26;
}
/*
 * Letter-wise difference a - b.  The result has empty text; its mask has a
 * bit set for every letter whose remaining count is non-zero.  No underflow
 * check is made here: counts wrap as unsigned char.
 */
inline worddata operator-(const worddata &a, const worddata &b)
{
    worddata diff;
    diff.l = a.l - b.l;
    for(int k = 0; k != 26; ++k)
    {
        const unsigned char remaining = a.c[k] - b.c[k];
        diff.c[k] = remaining;
        if(remaining != 0) diff.m |= (1<<k);
    }
    return diff;
}
#if !ANA_JS
/* Write the raw object representation of `t` (sizeof(T) bytes) to `o`. */
template<class T>
void bwrite(std::ostream &o, const T &t) {
    const char *raw = reinterpret_cast<const char *>(&t);
    o.write(raw, sizeof(T));
}
/* Write the raw bytes of the `n` objects starting at `t` to `o`. */
template<class T>
void bwrite(std::ostream &o, const T *t, size_t n) {
    const size_t nbytes = sizeof(T) * n;
    o.write(reinterpret_cast<const char *>(t), nbytes);
}
/* Write `t` to `o` as four bytes in big-endian order. */
void bwrite_be32(ostream &o, const uint32_t t) {
    const uint32_t wire = htobe32(t);
    bwrite(o, wire);
}
/* Read sizeof(T) raw bytes from `o` directly into `t`. */
template<class T>
void bread(std::istream &o, T &t) {
    char *raw = reinterpret_cast<char *>(&t);
    o.read(raw, sizeof(T));
}
/* Read the raw bytes of `n` objects from `o` into the array at `t`. */
template<class T>
void bread(std::istream &o, T *t, size_t n) {
    const size_t nbytes = sizeof(T) * n;
    o.read(reinterpret_cast<char *>(t), nbytes);
}
/*
 * Read four big-endian bytes from `o` and store the host-order value in `t`.
 * If the read fails the stream's fail state is set and `t` receives 0.
 */
void bread_be32(istream &o, uint32_t &t) {
    // Zero-initialised so a short/failed read never converts garbage.
    uint32_t t_be32 = 0;
    bread(o, t_be32);
    // Big-endian -> host is be32toh (the original used htobe32, which only
    // happens to perform the same byte swap).
    t = be32toh(t_be32);
}
/* True when every character of `s` is 7-bit ASCII (vacuously true if empty). */
inline bool ascii(const std::string &s)
{
    return std::all_of(s.begin(), s.end(),
                       [](char ch) { return isascii(ch) != 0; });
}
#endif
/*
 * Word list stored as one contiguous byte buffer.
 *
 * `wdata` holds the worddata records back to back (each padded to a
 * multiple of 16 bytes); `woff` holds the byte offset of each record.
 * Sorting reorders only `woff`.
 */
struct dict {
    /* Orders record offsets so that words with more letters come first. */
    struct byinvwordlen {
        byinvwordlen(const dict &d) : d(d) {}
        bool operator()(size_t a, size_t b) const {
            return lcnt(*reinterpret_cast<const worddata*>(&d.wdata[a]))
                 > lcnt(*reinterpret_cast<const worddata*>(&d.wdata[b]));
        }
        const dict &d;
    };
    vector<uint32_t> woff;
    vector<char> wdata;
    /* i-th word in the current order; throws std::out_of_range via at(). */
    const worddata *getword(size_t i) const {
        return reinterpret_cast<const worddata*>(&wdata[woff.at(i)]);
    }
    worddata *getword(size_t i) {
        return reinterpret_cast<worddata*>(&wdata[woff.at(i)]);
    }
    // 16-align words
    static size_t pad(size_t t) { return (t + 15) & ~size_t(15); }
    size_t nwords() const { return woff.size(); }
    /* Append a record for `word` at the end of the buffer. */
    void addword(const char *word) {
        size_t i = woff.size();
        size_t sz = pad(sizeof(worddata) + strlen(word));
        size_t off = wdata.size();
        wdata.resize(off+sz);
        woff.push_back(off);
        new(getword(i)) worddata(word);
    }
    /*
     * Read whitespace-separated words from a text file, skipping any word
     * containing non-ASCII characters, then sort.  An unreadable file
     * simply yields no words.
     */
    void readdict(const char *dictpath) {
        ifstream i(dictpath);
        string w;
        while((i >> w))
        {
            if(!ascii(w)) continue;
            addword(w.c_str());
        }
        sort_me();
    }
    /* Stable sort, longest words first. */
    void sort_me() {
        stable_sort(woff.begin(), woff.end(), byinvwordlen(*this));
    }
#if !ANA_JS
    static const uint32_t signature = 0x414e4144;
    static const uint32_t signature2 = sizeof(uint64_t);
    static const uint32_t signature_rev = 0x44414e41;
    /*
     * Write the dictionary to `ofn`: two signature words, then the length
     * and raw contents of woff, then the length and raw contents of wdata.
     * Lengths and signatures are big-endian; array contents are written as
     * they are in memory.
     * Throws runtime_error if either array is too large for the format.
     */
    void serialize(const char *ofn) const {
        // Validate both sizes before touching the output file.  (The
        // original tested woff.size() twice and never checked wdata.)
        if(woff.size() > INT32_MAX) {
            throw runtime_error("dictionary too big (woff)");
        }
        if(wdata.size() > INT32_MAX) {
            throw runtime_error("dictionary too big (wdata)");
        }
        ofstream o(ofn, ios::binary);
        bwrite_be32(o, signature);
        bwrite_be32(o, signature2);
        bwrite_be32(o, woff.size());
        // data() is valid even for an empty vector, unlike &v[0].
        bwrite(o, woff.data(), woff.size());
        bwrite_be32(o, wdata.size());
        bwrite(o, wdata.data(), wdata.size());
    }
    /*
     * Load a dictionary written by serialize().
     * Throws runtime_error if the file cannot be opened, is truncated, or
     * does not carry the expected signatures.
     */
    void deserialize(const char *ifn) {
        ifstream i(ifn, ios::binary);
        if(!i)
            throw runtime_error("cannot open anagram dictionary archive");
        uint32_t sig = 0;
        bread_be32(i, sig);
        if(!i)
            throw runtime_error("not an anagram dictionary archive");
        if(sig == signature_rev)
            throw runtime_error("endianness error");
        else if(sig != signature)
            throw runtime_error("not an anagram dictionary archive");
        bread_be32(i, sig);
        if(!i || sig != signature2)
            throw runtime_error("count-size error");
        uint32_t sz = 0;
        bread_be32(i, sz);
        if(!i)
            throw runtime_error("truncated anagram dictionary archive");
        woff.resize(sz);
        bread(i, woff.data(), sz);
        bread_be32(i, sz);
        if(!i)
            throw runtime_error("truncated anagram dictionary archive");
        wdata.resize(sz);
        bread(i, wdata.data(), sz);
        if(!i)
            throw runtime_error("truncated anagram dictionary archive");
    }
#endif
};
/* Print the command-line synopsis to stderr and exit with status 1. */
void usage(const char *progname)
{
    static const char synopsis[] =
        "Usage: %s [-d text-dictionary|-D bin-dictionary]\n\t"
        "[-l len1,...] [-m minlen] [-M maxlen]\n\t"
        "[-a] [-c] [-s] terms... -- required...\n";
    fprintf(stderr, synopsis, progname);
    exit(1);
}
void print_stack(ostream &o, vector<worddata *> &s)
{
for(vector<worddata *>::const_iterator it = s.begin(); it != s.end(); it++)
{
o << (*it)->w;
if(it + 1 == s.end()) o << "\n";
else o << " ";
}
}
/*
 * Predicate bound to a pool of letters `a`: accepts the words that
 * candidate() says can be formed from that pool.  Holds `a` by reference,
 * so the pool must outlive the predicate.
 */
struct filterer
{
    const worddata &a;

    filterer(const worddata &a) : a(a) { }

    bool operator()(const worddata *b) const
    {
        return candidate(a, *b);
    }
};
/*
 * Parse a list of unsigned integers such as "3,4,5".  After each number a
 * single following comma is consumed; parsing stops at the first position
 * where a number cannot be read.
 */
std::vector<size_t> parse_lengths(const char *l)
{
    std::vector<size_t> lengths;
    std::istringstream in(l);
    for(size_t value; in >> value; ) {
        lengths.push_back(value);
        if(in.peek() == ',') in.get();
    }
    return lengths;
}
#include <sys/time.h>
#include <sys/resource.h>
/*
 * Seconds on the clock used for timing reports: wall-clock time from
 * gettimeofday() when built with ANA_AS_JS, otherwise user + system CPU
 * time of this process from getrusage().
 */
double cputime()
{
#if defined(ANA_AS_JS)
    struct timeval now;
    gettimeofday(&now, NULL);
    return now.tv_sec + now.tv_usec * 1e-6;
#else
    struct rusage ru;
    getrusage(RUSAGE_SELF, &ru);
    const double user_s = ru.ru_utime.tv_sec + ru.ru_utime.tv_usec * 1e-6;
    const double sys_s = ru.ru_stime.tv_sec + ru.ru_stime.tv_usec * 1e-6;
    return user_s + sys_s;
#endif
}
struct ana_cfg {
ana_cfg()
: apos(0), just_candidates(0), minlen(3), maxlen(10),
total_matches(0), max_matches(1000), total_searches(0),
max_searches(1000000) {}
ana_cfg(bool apos, bool just_candidates, size_t minlen, size_t maxlen,
size_t max_matches,
size_t max_searches, const vector<size_t>& lengths,
const string &ws, const string &rs)
: apos(apos), just_candidates(just_candidates), minlen(minlen),
maxlen(maxlen),
total_matches(0), max_matches(max_matches), total_searches(0),
max_searches(max_searches), lengths(lengths), rs(rs), ww() {
auto wsv = wordholder(ws).value();
auto rsv = wordholder(rs).value();
if (candidate(wsv, rsv)) {
ww = wsv - rsv;
} else {
this->max_searches = 0;
}
}
bool apos, just_candidates;
size_t minlen, maxlen;
size_t total_matches, max_matches, total_searches, max_searches;
vector<size_t> lengths;
std::string rs;
worddata ww;
};
struct ana_frame {
worddata l;
vector<const worddata*> c;
vector<const worddata*>::iterator st, en;
vector<size_t>::iterator lst, len;
};
/* Complete state of a search in progress, advanced by step(). */
struct ana_st {
    // Parameters plus match/search counters.
    ana_cfg cfg;
    // cputime() reading taken when the search was set up.
    double t0;
    // Stack of search frames.
    vector<ana_frame> fr;
    // Text of the words chosen so far.
    vector<const char *> words;
};
/*
 * Initialise `st` for a new search over dictionary `d` using `cfg`.
 *
 * Builds the root frame, whose candidates are the dictionary words that
 * pass the length and apostrophe filters and can be formed from the letter
 * pool.  If required words were given they become the first entry of
 * `st.words`.
 */
void setup(ana_st &st, const ana_cfg &cfg, const dict &d)
{
    st.cfg = cfg;
    st.t0 = cputime();
    st.fr.clear();
    // step() keeps a reference to a frame across push_back() and asserts
    // size()+1 < capacity().  Each pushed frame uses up at least one letter
    // (zero-letter words are skipped below), so the stack holds at most
    // lcnt+1 frames and lcnt+2 slots always satisfy the assertion.
    // Reserving only lcnt was too small for short words (e.g. `-m 1 a`).
    st.fr.reserve(lcnt(cfg.ww) + 2);
    st.fr.push_back(ana_frame());
    ana_frame &f = st.fr.back();
    f.l = st.cfg.ww;
    f.lst = st.cfg.lengths.begin();
    f.len = st.cfg.lengths.end();
    filterer fi(f.l);
    for(size_t i=0; i != d.nwords(); i++)
    {
        const worddata *it = d.getword(i);
        // A word with no letters never shrinks the pool; accepting it
        // (possible only with minlen 0) would recurse without bound.
        if(lcnt(*it) == 0) continue;
        if(lcnt(*it) < cfg.minlen || lcnt(*it) > cfg.maxlen) continue;
        if(!cfg.apos && strchr(it->w, '\'')) continue;
        if(!fi(it)) continue;
        f.c.push_back(it);
    }
    f.st = f.c.begin();
    f.en = f.c.end();
    st.words.clear();
    if(!st.cfg.rs.empty()) st.words.push_back(st.cfg.rs.c_str());
}
/*
 * Join `words` into `resultline`, separated by single spaces.
 * `resultline` is cleared first.  Always returns true.
 *
 * The list is taken by const reference; the original by-value parameter
 * copied the whole vector for every match.
 */
bool words_to_string(const std::vector<const char*> &words, std::string &resultline) {
    resultline.clear();
    for(std::vector<const char *>::const_iterator it = words.begin(); it != words.end(); ++it)
    {
        // Separator only once something has been written.
        if(!resultline.empty()) resultline += ' ';
        resultline += *it;
    }
    return true;
}
/*
 * Advance the search until it has one line of output.
 *
 * Returns true when `resultline` holds a result (an anagram, or a single
 * candidate word in just_candidates mode) and more may follow.  Returns
 * false when `resultline` holds a "# ..." status line: the match limit or
 * the search limit was reached, or the search space is exhausted.
 *
 * `st.words` holds one entry per non-root frame, plus one leading entry
 * when required words were given.  When the root frame is popped there may
 * be no entry left to remove, so every pop of `st.words` is guarded; the
 * unguarded pop_back() on an empty vector was undefined behaviour at the
 * end of every search that had no required words.
 */
bool step(ana_st &st, string &resultline) {
    resultline = string();
    if(st.cfg.total_matches == st.cfg.max_matches) {
        ostringstream o;
        o << "# Reached maximum of " << st.cfg.total_matches << " matches in " << setprecision(2) << (cputime() - st.t0) << "s";
        resultline = o.str();
        return false;
    }
    while(!st.fr.empty()) {
        if(st.cfg.total_searches == st.cfg.max_searches) {
            ostringstream o;
            o << "# Reached maximum of " << st.cfg.total_searches << " searches in " << setprecision(2) << (cputime() - st.t0) << "s";
            resultline = o.str();
            return false;
        }
        st.cfg.total_searches ++;
        ana_frame &f = st.fr.back();
        if(!f.l) {
            // No letters left: the chosen words form a complete anagram.
            st.cfg.total_matches ++;
            words_to_string(st.words, resultline);
            if(!st.words.empty()) st.words.pop_back();
            st.fr.pop_back();
            return true;
        }
        if(f.lst != f.len) {
            // A word length is required at this depth: skip candidates
            // of any other length.
            size_t reqlen = *f.lst;
            while(f.st != f.en && lcnt(**f.st) != reqlen) f.st ++;
        }
        if(f.st == f.en)
        {
            // Candidates exhausted at this level: backtrack.
            st.fr.pop_back();
            if(!st.words.empty()) st.words.pop_back();
            continue;
        }
        if(st.cfg.just_candidates) {
            // Candidate-listing mode: report the word, do not descend.
            st.cfg.total_matches ++;
            resultline = std::string((*f.st)->w);
            f.st++;
            return true;
        }
        // guaranteed not to move, because of earlier reserve()!
        // (`f` is used again after the push_back below.)
        assert(st.fr.size() + 1 < st.fr.capacity());
        st.fr.push_back(ana_frame());
        st.words.push_back((*f.st)->w);
        ana_frame &nf = st.fr.back();
        nf.l = f.l - **f.st;
        nf.c.clear();
        if(f.lst != f.len) {
            // With a length list, every candidate of this level is
            // reconsidered at the next one.
            copy_if(f.c.begin(), f.en, back_inserter(nf.c), filterer(nf.l));
            nf.lst = f.lst + 1;
            nf.len = f.len;
        } else {
            // Without one, only the current candidate and those after it
            // are passed down.
            copy_if(f.st, f.en, back_inserter(nf.c), filterer(nf.l));
            nf.lst = nf.len = f.len;
        }
        nf.st = nf.c.begin();
        nf.en = nf.c.end();
        f.st++;
    }
    ostringstream o;
    o << "# " << st.cfg.total_matches << " matches ("
      << st.cfg.total_searches << " searches) in "
      << setprecision(2) << (cputime() - st.t0) << "s";
    resultline = o.str();
    return false;
}
/*
 * Run one complete search described by `cfg` over `d`, writing every line
 * produced by step() -- results and the final status line -- to `o`, each
 * followed by endl.  Always returns 0.
 */
int run(dict &d, ostream &o, ana_cfg &cfg) {
    ana_st st;
    setup(st, cfg, d);
    for(bool more = true; more; ) {
        std::string line;
        more = step(st, line);
        o << line << endl;
    }
    return 0;
}
/* Search-step limit passed to every ana_cfg built in this file. */
size_t maxsearch = 10000000;

/*
 * Convenience overload: build an ana_cfg from the individual options
 * (`aw` = search terms, `rw` = required words) and run it.
 */
int run(dict &d, ostream &o, bool apos, bool just_candidates, size_t minlen, size_t maxlen, size_t maxcount, vector<size_t> &lengths, string &aw, string &rw) {
    ana_cfg cfg(apos, just_candidates,
                minlen, maxlen,
                maxcount, maxsearch,
                lengths, aw, rw);
    const int status = run(d, o, cfg);
    return status;
}
void parse(const std::string &s, ana_cfg &st, bool def_apos, bool def_just_candidates, size_t def_minlen, size_t def_maxlen, size_t def_maxcount) {
string an;
string reqd;
bool apos = def_apos;
bool just_candidates = def_just_candidates;
size_t minlen = def_minlen;
size_t maxlen = def_maxlen;
size_t maxcount = def_maxcount;
vector<size_t> lengths;
string aw, rw;
int c;
istringstream i(s);
while((c = i.peek()) != EOF)
{
switch(c) {
case '<':
(void) i.get();
i >> maxlen; break;
case '>':
(void) i.get();
i >> minlen; break;
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
{
size_t len;
i >> len;
lengths.push_back(len);
minlen = std::min(minlen, len);
maxlen = std::max(maxlen, len);
break;
}
case '+': case '=':
{
string s;
(void) i.get();
i >> s;
if(!rw.empty()) rw = rw + " " + s;
else rw = s;
break;
}
case '-':
{
(void) i.get();
i >> maxcount;
maxcount = min(def_maxcount, maxcount);
break;
}
case '\'':
(void) i.get();
apos = !apos;
break;
case '?':
(void) i.get();
just_candidates = !just_candidates;
break;
default:
{
if(isspace(c)) {
(void) i.get();
break;
}
string s;
i >> s;
if(!aw.empty()) aw = aw + " " + s;
else aw = s;
break;
}
}
}
st.~ana_cfg();
new(&st) ana_cfg(apos, just_candidates, minlen, maxlen, maxcount, maxsearch,
lengths, aw, rw);
}
/*
 * Server loop: read one query per line from `i`, run it, and write the
 * results to `o`, followed by an empty line and a flush.  Ends when the
 * input is exhausted.
 */
void serve(istream &i, ostream &o, dict &d, bool def_apos, bool def_just_candidates, size_t def_minlen, size_t def_maxlen, size_t def_maxcount) {
    for(string query; getline(i, query); ) {
        ana_cfg cfg;
        parse(query, cfg,
              def_apos, def_just_candidates,
              def_minlen, def_maxlen, def_maxcount);
        run(d, o, cfg);
        o.put('\n');
        o.flush();
    }
}
#if defined(ANA_AS_PYMODULE)
// Python object wrapping a C++ dict.  Because the storage comes from
// tp_alloc, `d` is constructed with placement new in dict_new and destroyed
// explicitly in dict_dealloc.
struct dict_object {
PyObject_HEAD
dict d;
};
// Type object for dict_object.  Only the name and instance size are set
// here; flags, tp_new, tp_dealloc and tp_methods are filled in by
// PyInit_ana before PyType_Ready is called.
static PyTypeObject dict_type = {
.ob_base = PyVarObject_HEAD_INIT(NULL, 0)
.tp_name = "ana.anadict",
.tp_basicsize = sizeof(dict_object),
};
// Python iterator object for one running search.
struct search_object {
PyObject_HEAD;
// Dictionary being searched; py_run takes a reference, search_dealloc
// releases it.
dict_object *d;
// Search state; deleted and set to NULL by search_iternext once step()
// reports the search is over.
ana_st *st;
};
// Type object for search_object.  Only the name and instance size are set
// here; the remaining slots are filled in by PyInit_ana.
static PyTypeObject search_type = {
.ob_base = PyVarObject_HEAD_INIT(NULL, 0)
.tp_name = "ana.results",
.tp_basicsize = sizeof(search_object),
};
// Allocate a search_object through the type's tp_alloc slot.  `args` and
// `kw` are ignored.  Returns whatever tp_alloc returned, cast to
// search_object*.
search_object *search_new(PyTypeObject *type, PyObject *args, PyObject *kw)
{
    PyObject *raw = type->tp_alloc(type, 0);
    return reinterpret_cast<search_object*>(raw);
}
// anadict.run(terms, apos=False, just_candidates=False, minlen=3,
//             maxlen=11, maxcount=1000)
//
// Parse the query string `terms` and return a new results iterator
// positioned at the start of the search.  Returns NULL with a Python
// exception set if the arguments are invalid, allocation fails, or a C++
// exception is raised while setting up the search.
PyObject *py_run(PyObject *self, PyObject *args, PyObject *keywds) {
    dict_object *d = (dict_object*)self;
    int apos = false, just_candidates = false;
    Py_ssize_t minlen = 3;
    Py_ssize_t maxlen = 11;
    Py_ssize_t maxcount = 1000;
    char *terms;
    Py_ssize_t terms_sz;
    static const char * kwlist[] = {
        "terms", "apos", "just_candidates", "minlen", "maxlen", "maxcount",
        NULL};
    if(!PyArg_ParseTupleAndKeywords(args, keywds, "s#|iinnn:ana.dict.run",
                const_cast<char**>(kwlist),
                &terms, &terms_sz, &apos, &just_candidates,
                &minlen, &maxlen, &maxcount))
        return NULL;
    search_object *o = search_new(&search_type, 0, 0);
    // tp_alloc reports failure by returning NULL with an exception set.
    if(!o) return NULL;
    o->d = d;
    Py_INCREF(d);
    try {
        string query(terms, terms+terms_sz);
        ana_cfg cfg;
        parse(query, cfg, apos, just_candidates, minlen, maxlen, maxcount);
        o->st = new ana_st;
        setup(*o->st, cfg, d->d);
    } catch(const std::exception &e) {
        // C++ exceptions must not propagate into the interpreter.
        // Dropping the reference runs search_dealloc, which releases the
        // dictionary reference and any search state already allocated.
        PyErr_SetString(PyExc_RuntimeError, e.what());
        Py_DECREF(o);
        return NULL;
    }
    return reinterpret_cast<PyObject*>(o);
}
// Method table for ana.anadict instances; the empty entry terminates it.
static PyMethodDef dict_methods[] = {
{"run", reinterpret_cast<PyCFunction>(py_run), METH_VARARGS|METH_KEYWORDS,
"Run one anagram"},
{},
};
// tp_new for ana.anadict: allocate the Python object and construct the
// embedded C++ dict in place.  `args` and `kw` are ignored.
// Returns NULL (with the exception set by tp_alloc) if allocation fails.
static PyObject *
dict_new(PyTypeObject *type, PyObject *args, PyObject *kw) {
    dict_object *self = reinterpret_cast<dict_object*>(type->tp_alloc(type, 0));
    // Do not placement-construct into a failed allocation.
    if(!self) return NULL;
    new(&self->d) dict();
    return (PyObject *)self;
}
// tp_dealloc for ana.anadict: run the destructor of the embedded dict by
// hand (it was built with placement new), then free the object's memory.
static void
dict_dealloc(dict_object *self) {
self->d.~dict();
Py_TYPE(self)->tp_free((PyObject*)self);
}
// ana.from_ascii(path): build an anadict from a text word list.
// Returns NULL with a Python exception set on bad arguments, allocation
// failure, or a C++ exception while reading.
static PyObject *
py_fromascii(PyObject *self, PyObject *args) {
    // Parse the arguments before allocating, so a bad call leaks nothing.
    char *path;
    if(!PyArg_ParseTuple(args, "s", &path)) return NULL;
    PyObject *d = dict_new(&dict_type, NULL, NULL);
    if(!d) return NULL;
    dict_object *r = reinterpret_cast<dict_object*>(d);
    try {
        r->d.readdict(path);
    } catch(const std::exception &e) {
        // C++ exceptions must not propagate into the interpreter.
        PyErr_SetString(PyExc_RuntimeError, e.what());
        Py_DECREF(d);
        return NULL;
    }
    return d;
}
// ana.from_binary(path): build an anadict from a serialized dictionary.
// Returns NULL with a Python exception set on bad arguments, allocation
// failure, or when deserialization throws (e.g. a bad signature).
static PyObject *
py_frombin(PyObject *self, PyObject *args) {
    // Parse the arguments before allocating, so a bad call leaks nothing.
    char *path;
    if(!PyArg_ParseTuple(args, "s", &path)) return NULL;
    PyObject *d = dict_new(&dict_type, NULL, NULL);
    if(!d) return NULL;
    dict_object *r = reinterpret_cast<dict_object*>(d);
    try {
        r->d.deserialize(path);
    } catch(const std::exception &e) {
        // deserialize() reports malformed files with runtime_error; turn
        // that into a Python exception instead of terminating the process.
        PyErr_SetString(PyExc_RuntimeError, e.what());
        Py_DECREF(d);
        return NULL;
    }
    return d;
}
// Module-level functions of `ana`; the empty entry terminates the table.
static PyMethodDef methods[] = {
{"from_ascii", py_fromascii, METH_VARARGS, "Parse ASCII dictionary"},
{"from_binary", py_frombin, METH_VARARGS, "Parse binary dictionary"},
{},
};
// tp_iter for ana.results: the object is its own iterator, so return it
// with a new reference.
static PyObject *
search_iter(PyObject *self) {
Py_INCREF(self);
return self;
}
// tp_iternext for ana.results: return the next line of output as a str.
//
// Once the search state has been discarded this returns NULL without
// setting an exception (NOTE(review): CPython documents that as "iterator
// exhausted" for tp_iternext -- confirm against the targeted version).
// The call on which step() reports the search is over still returns that
// final "# ..." status line as an item; the state is deleted at that point.
//
// Note: it's up to the user to ensure no more than one thread is calling
// .next() on a specific search_object at the same time
static PyObject *
search_iternext(search_object *self) {
std::string result;
if(!self->st) return NULL;
// step() runs between the ALLOW_THREADS macros, i.e. without the GIL, and
// self->st is modified there too -- hence the single-caller requirement
// above.
Py_BEGIN_ALLOW_THREADS
bool res = step(*self->st, result);
if(!res) { delete self->st; self->st = NULL; }
Py_END_ALLOW_THREADS
return PyUnicode_FromStringAndSize(result.data(), result.size());
}
// tp_dealloc for ana.results: release the dictionary reference (if any),
// delete any remaining search state, then free the object's memory.
static void
search_dealloc(search_object *self) {
Py_XDECREF(self->d);
delete self->st;
Py_TYPE(self)->tp_free((PyObject*)self);
}
// Module definition for `ana`: name, docstring, size field of -1
// (NOTE(review): per the CPython docs this means no per-module state --
// confirm), and the module-level function table.
static struct PyModuleDef moduledef = {
PyModuleDef_HEAD_INIT,
"ana",
"Quickly compute anagrams of strings",
-1,
methods
};
// Module initialisation: finish the two type objects, create the module,
// and publish the dictionary type as `ana.anadict`.
// Returns NULL with a Python exception set if any step fails.
PyMODINIT_FUNC
PyInit_ana(void) {
    // Ready both types before creating the module, so that a failure here
    // has no module object to leak.
    dict_type.tp_flags = Py_TPFLAGS_DEFAULT;
    dict_type.tp_new = dict_new;
    dict_type.tp_dealloc = reinterpret_cast<destructor>(dict_dealloc);
    dict_type.tp_methods = dict_methods;
    if(PyType_Ready(&dict_type) < 0) return NULL;
    search_type.tp_flags = Py_TPFLAGS_DEFAULT;
    search_type.tp_new = reinterpret_cast<newfunc>(search_new);
    search_type.tp_dealloc = reinterpret_cast<destructor>(search_dealloc);
    search_type.tp_iter = reinterpret_cast<getiterfunc>(search_iter);
    search_type.tp_iternext =
        reinterpret_cast<iternextfunc>(search_iternext);
    if(PyType_Ready(&search_type) < 0) return NULL;

    PyObject *m = PyModule_Create(&moduledef);
    if(!m) return NULL;

    // PyModule_AddObject steals a reference on success only, so take one
    // first and give it back if the call fails.
    Py_INCREF(reinterpret_cast<PyObject *>(&dict_type));
    if(PyModule_AddObject(m, "anadict", (PyObject *)&dict_type) < 0) {
        Py_DECREF(reinterpret_cast<PyObject *>(&dict_type));
        Py_DECREF(m);
        return NULL;
    }
    return m;
}
#elif defined(ANA_AS_JS)
#include <emscripten/bind.h>
// Dictionary shared by all js_run calls; loaded on first use.
dict d;

// JavaScript entry point: run the query `s` with defaults (no apostrophes,
// full search, lengths 3..11, at most 10000 matches) and return all output
// lines, each terminated by a newline, as one string.  The word list is
// read from the file "words" the first time the dictionary is empty.
std::string js_run(std::string s) {
    ana_cfg cfg;
    parse(s, cfg, false, false, 3, 11, 10000);
    if (d.nwords() == 0) {
        std::cerr << "# reading words\n";
        d.readdict("words");
    }
    ana_st st;
    setup(st, cfg, d);
    std::string result;
    bool more;
    do {
        std::string line;
        more = step(st, line);
        result += line;
        result += "\n";
    } while(more);
    return result;
}
// Register js_run with Emscripten's binding layer under the name "ana".
EMSCRIPTEN_BINDINGS(ana) {
emscripten::function("ana", &js_run);
}
#else
/*
 * Command-line entry point.
 *
 * Options:
 *   -a        toggle acceptance of words containing apostrophes
 *   -c        toggle candidates-only mode
 *   -L n      maximum number of matches
 *   -D file   binary dictionary (with -d: convert text to binary and exit)
 *   -d file   text dictionary (default /usr/share/dict/words)
 *   -M n      maximum word length      -m n   minimum word length
 *   -l a,b,.. required word lengths
 *   -s        toggle server mode (one query per line on stdin)
 * Non-option arguments are the search terms; arguments after "--" are the
 * required words.
 */
int main(int argc, char **argv)
{
    const char *dictpath=0, *defdict="/usr/share/dict/words", *bindict=0;
    int opt;
    size_t minlen=3, maxlen=11, maxcount=1000;
    bool apos=false, just_candidates=false, server=false;
    vector<size_t> lengths;
    string aw;
    // The leading '-' makes getopt return each non-option argument as
    // option code 1.  "L:" was missing from this string, so the `case 'L'`
    // branch below could never be reached and -L printed the usage text.
    while((opt = getopt(argc, argv, "-acD:d:L:M:m:l:s")) != -1)
    {
        switch(opt)
        {
        case 'a': apos = !apos; break;
        case 'c': just_candidates = !just_candidates; break;
        case 'L': maxcount = atoi(optarg); break;
        case 'D': bindict = optarg; break;
        case 'd': dictpath = optarg; break;
        case 'M': maxlen = atoi(optarg); break;
        case 'm': minlen = atoi(optarg); break;
        case 'l': lengths = parse_lengths(optarg); break;
        case 's': server = !server; break;
        case 1: aw += optarg; break;
        default: usage(argv[0]);
        }
    }
    dict d;
    if(bindict && dictpath) {
        // Both given: convert the text dictionary to binary and stop.
        d.readdict(dictpath);
        d.serialize(bindict);
        cout << "# Read and serialized " << d.nwords() << " candidate words in " << setprecision(2) << cputime() << "s\n";
        return 0;
    } else if(bindict) {
        d.deserialize(bindict);
        cout << "# Deserialized " << d.nwords() << " candidate words in " << setprecision(2) << cputime() << "s\n";
    } else {
        d.readdict(dictpath ? dictpath : defdict);
        cout << "# Read " << d.nwords() << " candidate words in " << setprecision(2) << cputime() << "s\n";
    }
    if(server) {
        serve(cin, cout, d, apos, just_candidates, minlen, maxlen, maxcount);
        return 0;
    } else {
        // Everything left after option parsing (the arguments following
        // "--") forms the required words.
        string rw;
        for(int i=optind; i<argc; i++)
        {
            if(rw.empty()) rw = argv[i];
            else rw = rw + " " + argv[i];
        }
        return run(d, cout, apos, just_candidates, minlen, maxlen, maxcount, lengths, aw, rw);
    }
}
#endif