the_silver_searcher/src/util.c

665 lines
17 KiB
C

#include <ctype.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include "config.h"
#include "util.h"
#ifdef _WIN32
#include <windows.h>
#define flockfile(x)
#define funlockfile(x)
#define getc_unlocked(x) getc(x)
#endif
/*
 * Statement macro shared by the allocation wrappers below: abort through
 * die() when `ptr` is NULL, otherwise return it from the enclosing function.
 *
 * The expansion ends with its own `return ...;`, so call sites invoke it
 * WITHOUT a trailing semicolon and it must be the last statement of the
 * function. The argument is parenthesized so that any pointer expression
 * can be passed safely.
 */
#define CHECK_AND_RETURN(ptr)             \
    if ((ptr) == NULL) {                  \
        die("Memory allocation failed."); \
    }                                     \
    return (ptr);
/*
 * malloc() wrapper: returns the new allocation, or calls die() when
 * malloc() returns NULL.
 */
void *ag_malloc(size_t size) {
    void *mem = malloc(size);

    CHECK_AND_RETURN(mem)
}
/*
 * realloc() wrapper: returns the resized allocation, or calls die() when
 * realloc() returns NULL.
 */
void *ag_realloc(void *ptr, size_t size) {
    void *resized = realloc(ptr, size);

    CHECK_AND_RETURN(resized)
}
/*
 * calloc() wrapper: returns zero-initialized storage for `count` objects of
 * `size` bytes each, or calls die() when calloc() returns NULL.
 */
void *ag_calloc(size_t count, size_t size) {
    void *zeroed = calloc(count, size);

    CHECK_AND_RETURN(zeroed)
}
/*
 * strdup() wrapper: returns a heap copy of the NUL-terminated string `s`,
 * or calls die() when strdup() returns NULL. The caller frees the result.
 */
char *ag_strdup(const char *s) {
    char *copy = strdup(s);

    CHECK_AND_RETURN(copy)
}
/*
 * Duplicate at most `size` bytes of `s` into a freshly allocated,
 * NUL-terminated string. Allocation failure ends in die(). The caller frees
 * the result.
 *
 * With HAVE_STRNDUP the platform strndup() does the work. The fallback must
 * not read beyond `size` bytes of `s`, because `s` may be a slice of a larger
 * buffer without a terminating NUL; it therefore uses a bounded memchr()
 * rather than strlcpy(), which scans the source up to its NUL.
 */
char *ag_strndup(const char *s, size_t size) {
    char *str = NULL;
#ifdef HAVE_STRNDUP
    str = strndup(s, size);
    CHECK_AND_RETURN(str)
#else
    /* Length of the copy: up to the first NUL, but never more than `size`. */
    const char *nul = memchr(s, '\0', size);
    const size_t len = (nul != NULL) ? (size_t)(nul - s) : size;

    str = (char *)ag_malloc(len + 1);
    memcpy(str, s, len);
    str[len] = '\0';
    return str;
#endif
}
/*
 * Release an array of heap-allocated strings: free() each of the first
 * `strs_len` entries, then the array itself. A NULL `strs` is a no-op.
 */
void free_strings(char **strs, const size_t strs_len) {
    size_t idx = 0;

    if (strs != NULL) {
        while (idx < strs_len) {
            free(strs[idx]);
            idx++;
        }
        free(strs);
    }
}
/*
 * Fill the 256-entry per-byte skip table used by the Boyer-Moore searches
 * in this file.
 *
 * Every entry starts at `f_len`. Then, for each pattern byte except the last
 * one, the entry for that byte is set to its distance from the end of the
 * pattern (later occurrences overwrite earlier ones). When `case_sensitive`
 * is zero, both the lower- and upper-case variants of the byte are set.
 *
 * find           pattern bytes
 * f_len          number of bytes in `find`
 * skip_lookup    output table; must have room for 256 entries
 * case_sensitive non-zero for exact byte matching
 */
void generate_alpha_skip(const char *find, size_t f_len, size_t skip_lookup[], const int case_sensitive) {
    size_t i;

    for (i = 0; i < 256; i++) {
        skip_lookup[i] = f_len;
    }

    /* An empty pattern has no bytes to index; also avoids wrapping f_len. */
    if (f_len == 0) {
        return;
    }

    f_len--;

    for (i = 0; i < f_len; i++) {
        /* ctype functions require a value representable as unsigned char. */
        const unsigned char c = (unsigned char)find[i];
        const size_t shift = f_len - i;

        if (case_sensitive) {
            skip_lookup[c] = shift;
        } else {
            skip_lookup[(unsigned char)tolower(c)] = shift;
            skip_lookup[(unsigned char)toupper(c)] = shift;
        }
    }
}
/*
 * Return 1 when the tail of `s` starting at index `pos` equals the beginning
 * of `s` (i.e. s[pos .. s_len-1] matches s[0 .. s_len-pos-1]), else 0.
 * When `pos >= s_len` nothing is compared and the result is 1.
 *
 * With `case_sensitive` zero, bytes are compared after tolower(). Bytes are
 * converted to unsigned char first, because passing a negative char to a
 * ctype function is undefined.
 */
int is_prefix(const char *s, const size_t s_len, const size_t pos, const int case_sensitive) {
    size_t i;

    for (i = 0; pos + i < s_len; i++) {
        const unsigned char head = (unsigned char)s[i];
        const unsigned char tail = (unsigned char)s[i + pos];

        if (case_sensitive) {
            if (head != tail) {
                return 0;
            }
        } else if (tolower(head) != tolower(tail)) {
            return 0;
        }
    }
    return 1;
}
/*
 * Count how many bytes match when walking backwards simultaneously from
 * s[pos] and from the last byte s[s_len - 1]. At most `pos` bytes are
 * compared, so the result is in the range 0..pos.
 *
 * With `case_sensitive` zero, bytes are compared after tolower(). Bytes are
 * converted to unsigned char first, because passing a negative char to a
 * ctype function is undefined.
 */
size_t suffix_len(const char *s, const size_t s_len, const size_t pos, const int case_sensitive) {
    size_t i;

    for (i = 0; i < pos; i++) {
        const unsigned char inner = (unsigned char)s[pos - i];
        const unsigned char outer = (unsigned char)s[s_len - i - 1];

        if (case_sensitive) {
            if (inner != outer) {
                break;
            }
        } else if (tolower(inner) != tolower(outer)) {
            break;
        }
    }
    return i;
}
/*
 * Build the per-pattern-position skip table used by the Boyer-Moore
 * searches in this file, alongside the per-byte table from
 * generate_alpha_skip().
 *
 * A table of `f_len` entries is allocated with ag_malloc() and handed back
 * through `*skip_lookup`; this function never frees it.
 *
 * Pass 1 walks the pattern from the end, remembering the most recent
 * position at which is_prefix() holds, and stores a shift based on it.
 * Pass 2 overrides entries using the suffix lengths reported by
 * suffix_len().
 */
void generate_find_skip(const char *find, const size_t f_len, size_t **skip_lookup, const int case_sensitive) {
    size_t *table = ag_malloc(f_len * sizeof(size_t));
    size_t prefix_pos = f_len;
    size_t idx;

    *skip_lookup = table;

    /* Pass 1: prefix-based shifts, filled from the last entry to the first. */
    for (idx = f_len; idx > 0; idx--) {
        if (is_prefix(find, f_len, idx, case_sensitive)) {
            prefix_pos = idx;
        }
        table[idx - 1] = prefix_pos + (f_len - idx);
    }

    /* Pass 2: suffix-based shifts. */
    for (idx = 0; idx < f_len; idx++) {
        const size_t matched = suffix_len(find, f_len, idx, case_sensitive);

        if (find[idx - matched] != find[f_len - 1 - matched]) {
            table[f_len - 1 - matched] = f_len - 1 - idx + matched;
        }
    }
}
/* Return the larger of two size_t values (`a` when they are equal). */
size_t ag_max(size_t a, size_t b) {
    return (b > a) ? b : a;
}
/*
 * Boyer-Moore strstr: case-sensitive search for `find` (f_len bytes) inside
 * `s` (s_len bytes).
 *
 * alpha_skip_lookup  256-entry per-byte skip table
 * find_skip_lookup   f_len-entry per-position skip table
 * (presumably produced by generate_alpha_skip() / generate_find_skip();
 * the loop relies on each shift moving the window forward)
 *
 * Returns a pointer to the first byte of a match inside `s`, or NULL when
 * there is none. An empty pattern never matches.
 *
 * `pos` always indexes the byte of `s` aligned with the LAST pattern byte.
 * The number of matched bytes is tracked separately, so `pos` is never
 * decremented below zero, even for a match at offset 0.
 */
const char *boyer_moore_strnstr(const char *s, const char *find, const size_t s_len, const size_t f_len,
                                const size_t alpha_skip_lookup[], const size_t *find_skip_lookup) {
    size_t pos;

    if (f_len == 0) {
        return NULL;
    }

    pos = f_len - 1;
    while (pos < s_len) {
        size_t matched = 0;
        size_t mismatch_at;
        size_t alpha_shift;
        size_t find_shift;

        /* Compare right-to-left from the end of the pattern. */
        while (matched < f_len && s[pos - matched] == find[f_len - 1 - matched]) {
            matched++;
        }
        if (matched == f_len) {
            return s + (pos - (f_len - 1));
        }

        /* Shift from the mismatching byte by the larger of the two skips. */
        mismatch_at = pos - matched;
        alpha_shift = alpha_skip_lookup[(unsigned char)s[mismatch_at]];
        find_shift = find_skip_lookup[f_len - 1 - matched];
        pos = mismatch_at + (alpha_shift > find_shift ? alpha_shift : find_shift);
    }

    return NULL;
}
/*
 * Case-insensitive twin of boyer_moore_strnstr(); the structure is
 * deliberately kept identical to it.
 *
 * Each haystack byte is passed through tolower() before being compared with
 * the pattern byte, so `find` is expected to be lower-case already. The
 * per-byte skip table is indexed with the raw haystack byte.
 *
 * Bytes are converted to unsigned char before tolower() (a negative char
 * argument is undefined), and the pattern byte is compared as unsigned char
 * so that bytes >= 0x80 still compare equal to themselves.
 *
 * Returns a pointer to the first byte of a match inside `s`, or NULL when
 * there is none. An empty pattern never matches.
 */
const char *boyer_moore_strncasestr(const char *s, const char *find, const size_t s_len, const size_t f_len,
                                    const size_t alpha_skip_lookup[], const size_t *find_skip_lookup) {
    size_t pos;

    if (f_len == 0) {
        return NULL;
    }

    /* `pos` indexes the haystack byte aligned with the last pattern byte. */
    pos = f_len - 1;
    while (pos < s_len) {
        size_t matched = 0;
        size_t mismatch_at;
        size_t alpha_shift;
        size_t find_shift;

        /* Compare right-to-left from the end of the pattern. */
        while (matched < f_len &&
               tolower((unsigned char)s[pos - matched]) == (unsigned char)find[f_len - 1 - matched]) {
            matched++;
        }
        if (matched == f_len) {
            return s + (pos - (f_len - 1));
        }

        /* Shift from the mismatching byte by the larger of the two skips. */
        mismatch_at = pos - matched;
        alpha_shift = alpha_skip_lookup[(unsigned char)s[mismatch_at]];
        find_shift = find_skip_lookup[f_len - 1 - matched];
        pos = mismatch_at + (alpha_shift > find_shift ? alpha_shift : find_shift);
    }

    return NULL;
}
/*
 * Select the substring-search routine for the requested casing:
 * boyer_moore_strncasestr for CASE_INSENSITIVE, boyer_moore_strnstr for
 * every other value.
 */
strncmp_fp get_strstr(enum case_behavior casing) {
    if (casing == CASE_INSENSITIVE) {
        return &boyer_moore_strncasestr;
    }
    return &boyer_moore_strnstr;
}
/*
 * Replace the match list with its inverse, in place.
 *
 * buf / buf_len   the searched buffer
 * matches         on entry, `matches_len` matches, walked in array order;
 *                 on return, the first <return value> entries hold the
 *                 inverted matches
 * matches_len     number of valid entries in `matches` on entry
 *
 * Returns the number of inverted matches written.
 *
 * With no matches the whole buffer becomes a single match
 * [0, buf_len - 1]; this writes matches[0], so the array needs room for at
 * least one entry.
 * NOTE(review): for buf_len == 0 that end value wraps around - confirm
 * callers never pass an empty buffer here.
 *
 * Otherwise the buffer is scanned byte by byte. Inverted ranges start at
 * line starts and end at line ends ('\n' delimited); a range that is open
 * when a match begins is emitted up to the last line end seen, and a range
 * still open at the final byte is emitted up to that byte.
 */
size_t invert_matches(const char *buf, const size_t buf_len, match_t matches[], size_t matches_len) {
    size_t i;
    size_t match_read_index = 0;
    size_t inverted_match_count = 0;
    size_t inverted_match_start = 0;
    size_t last_line_end = 0;
    int in_inverted_match = TRUE;
    match_t next_match;

    /* Cast for the format: size_t has no portable pre-C99 conversion. */
    log_debug("Inverting %lu matches.", (unsigned long)matches_len);

    /* No matches, so the whole buffer is now a match. */
    if (matches_len == 0) {
        matches[0].start = 0;
        matches[0].end = buf_len - 1;
        return 1;
    }

    next_match = matches[0];

    for (i = 0; i < buf_len; i++) {
        if (i == next_match.start) {
            /* Jump over the match; the loop increment lands on its end. */
            i = next_match.end - 1;
            match_read_index++;

            if (match_read_index < matches_len) {
                next_match = matches[match_read_index];
            }

            /* Close the open inverted range at the last complete line. */
            if (in_inverted_match && last_line_end > inverted_match_start) {
                matches[inverted_match_count].start = inverted_match_start;
                matches[inverted_match_count].end = last_line_end - 1;
                inverted_match_count++;
            }

            in_inverted_match = FALSE;
        } else if (i == buf_len - 1 && in_inverted_match) {
            /* Last byte of the buffer: flush the open inverted range. */
            matches[inverted_match_count].start = inverted_match_start;
            matches[inverted_match_count].end = i;
            inverted_match_count++;
        } else if (buf[i] == '\n') {
            last_line_end = i + 1;

            if (!in_inverted_match) {
                inverted_match_start = last_line_end;
            }

            in_inverted_match = TRUE;
        }
    }

    /* Only the entries written above are inverted matches. */
    for (i = 0; i < inverted_match_count; i++) {
        log_debug("Inverted match %lu start %lu end %lu.",
                  (unsigned long)i,
                  (unsigned long)matches[i].start,
                  (unsigned long)matches[i].end);
    }

    return inverted_match_count;
}
/*
 * Ensure `sr->matches` has room for `matches_spare` more entries beyond
 * `sr->matches_len`, i.e. that
 *     sr->matches_len + matches_spare < sr->matches_size
 * holds on return. Nothing happens when that is already true.
 *
 * Growth starts at 100 entries (when there is no array yet, or the recorded
 * size is zero) and doubles until the requested spare fits, so a large
 * `matches_spare` is honoured with one call. Reallocation goes through
 * ag_realloc().
 */
void realloc_matches(search_results_t *sr, size_t matches_spare) {
    const size_t needed = sr->matches_len + matches_spare;

    if (needed < sr->matches_size) {
        return;
    }

    /* TODO: benchmark initial size of matches. 100 may be too small/big */
    if (sr->matches == NULL || sr->matches_size == 0) {
        sr->matches_size = 100;
    } else {
        sr->matches_size *= 2;
    }

    /* One doubling may not be enough for a large request. */
    while (sr->matches_size <= needed) {
        sr->matches_size *= 2;
    }

    sr->matches = ag_realloc(sr->matches, sr->matches_size * sizeof(match_t));
}
/*
 * Compile the regex `q` with pcre_compile() and then study it with
 * pcre_study().
 *
 * re          receives the compiled pattern; on compile failure die() is
 *             called with the error offset and message
 * re_extra    receives the study data, which may be NULL
 * pcre_opts   options passed to pcre_compile()
 * study_opts  options passed to pcre_study()
 *
 * A NULL study result is only logged at debug level. pcre_err may itself be
 * NULL at that point, so it is never handed to "%s" directly.
 */
void compile_study(pcre **re, pcre_extra **re_extra, char *q, const int pcre_opts, const int study_opts) {
    const char *pcre_err = NULL;
    int pcre_err_offset = 0;

    *re = pcre_compile(q, pcre_opts, &pcre_err, &pcre_err_offset, NULL);
    if (*re == NULL) {
        die("Bad regex! pcre_compile() failed at position %i: %s\nIf you meant to search for a literal string, run ag with -Q",
            pcre_err_offset,
            pcre_err);
    }

    *re_extra = pcre_study(*re, study_opts, &pcre_err);
    if (*re_extra == NULL) {
        log_debug("pcre_study returned nothing useful. Error: %s",
                  pcre_err != NULL ? pcre_err : "(none)");
    }
}
/*
 * This function is very hot. It's called on every file.
 *
 * Heuristically decide whether a buffer holds binary data, looking at no
 * more than its first 512 bytes. Returns 1 for binary, 0 for text.
 *
 *  - an empty buffer is text
 *  - a leading UTF-8 BOM means text
 *  - a leading "%PDF-" means binary
 *  - any NUL byte among the scanned bytes means binary
 *  - otherwise bytes outside 7..14 and 32..127 that are not part of a
 *    well-formed 2- or 3-byte UTF-8 sequence count as suspicious, and the
 *    buffer is binary once suspicious bytes exceed 10% of the scanned length
 */
int is_binary(const void *buf, const size_t buf_len) {
    const unsigned char *bytes = buf;
    const size_t scan_len = buf_len > 512 ? 512 : buf_len;
    size_t odd_bytes = 0;
    size_t idx;

    if (buf_len == 0) {
        return 0;
    }

    /* UTF-8 BOM. This isn't binary. */
    if (buf_len >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) {
        return 0;
    }

    /* PDF. This is binary. */
    if (buf_len >= 5 && strncmp(buf, "%PDF-", 5) == 0) {
        return 1;
    }

    for (idx = 0; idx < scan_len; idx++) {
        const unsigned char c = bytes[idx];

        /* NULL char. It's binary */
        if (c == '\0') {
            return 1;
        }

        /* Control characters 7..14 and the range 32..127 are plain text. */
        if ((c >= 7 && c <= 14) || (c >= 32 && c <= 127)) {
            continue;
        }

        /* UTF-8 detection */
        if (c > 193 && c < 224 && idx + 1 < scan_len) {
            /* Lead byte of a 2-byte sequence: consume the next byte. */
            idx++;
            if (bytes[idx] > 127 && bytes[idx] < 192) {
                continue;
            }
        } else if (c > 223 && c < 240 && idx + 2 < scan_len) {
            /* Lead byte of a 3-byte sequence: check both continuation bytes. */
            idx++;
            if (bytes[idx] > 127 && bytes[idx] < 192 && bytes[idx + 1] > 127 && bytes[idx + 1] < 192) {
                idx++;
                continue;
            }
        }

        odd_bytes++;

        /* Disk IO is so slow that it's worthwhile to do this calculation after every suspicious byte. */
        /* This is true even on a 1.6Ghz Atom with an Intel 320 SSD. */
        /* Read at least 32 bytes before making a decision */
        if (idx >= 32 && (odd_bytes * 100) / scan_len > 10) {
            return 1;
        }
    }

    return (odd_bytes * 100) / scan_len > 10 ? 1 : 0;
}
/*
 * Return 1 when `query` contains at least one of the characters
 * $ ( ) * + . ? [ \ ^ { |  and 0 otherwise.
 */
int is_regex(const char *query) {
    static const char regex_chars[] = "$()*+.?[\\^{|";

    return strpbrk(query, regex_chars) != NULL;
}
/*
 * Return 1 when `filename` contains at least one of the characters
 * ! * ? [ ]  and 0 otherwise.
 */
int is_fnmatch(const char *filename) {
    static const char fnmatch_chars[] = "!*?[]";

    return strpbrk(filename, fnmatch_chars) != NULL;
}
/*
 * Binary search for `needle` in haystack[start .. end-1] (half-open range),
 * comparing with strcmp(). The probing order assumes the strings are sorted
 * in strcmp() order.
 *
 * Returns the index of a matching entry, or -1 when the range is exhausted.
 */
int binary_search(const char *needle, char **haystack, int start, int end) {
    int lo = start;
    int hi = end;

    while (lo != hi) {
        const int mid = lo + ((hi - lo) / 2);
        const int rc = strcmp(needle, haystack[mid]);

        if (rc == 0) {
            return mid;
        }
        if (rc < 0) {
            hi = mid;
        } else {
            lo = mid + 1;
        }
    }

    return -1;
}
/* Lookup table indexed by byte value: 1 for [A-Za-z0-9_], 0 otherwise. */
static int wordchar_table[256];

/*
 * Populate wordchar_table. Until this has run, every entry is 0 and
 * is_wordchar() reports 0 for every byte.
 */
void init_wordchar_table(void) {
    int code;

    for (code = 0; code < 256; ++code) {
        const char c = (char)code;
        int is_word = 0;

        if (c == '_') {
            is_word = 1;
        } else if (c >= '0' && c <= '9') {
            is_word = 1;
        } else if (c >= 'A' && c <= 'Z') {
            is_word = 1;
        } else if (c >= 'a' && c <= 'z') {
            is_word = 1;
        }

        wordchar_table[code] = is_word;
    }
}

/* Table lookup: 1 when `ch` is a word character, 0 otherwise. */
int is_wordchar(char ch) {
    const unsigned char slot = (unsigned char)ch;

    return wordchar_table[slot];
}
/*
 * Return TRUE when every character of the NUL-terminated string `s` is
 * ASCII and not upper-case; FALSE as soon as a non-ASCII or upper-case
 * character is found. An empty string yields TRUE.
 */
int is_lowercase(const char *s) {
    const char *p;

    for (p = s; *p != '\0'; p++) {
        /* isupper() is reached only for ASCII characters. */
        if (!isascii(*p) || isupper(*p)) {
            return FALSE;
        }
    }

    return TRUE;
}
/*
 * Report whether directory entry `d`, found inside `path`, is a directory.
 *
 * When the entry carries a usable d_type it is trusted. For DT_UNKNOWN and
 * DT_LNK entries (or without d_type support) the full path is built and
 * stat()ed. Returns FALSE when stat() fails.
 */
int is_directory(const char *path, const struct dirent *d) {
#ifdef HAVE_DIRENT_DTYPE
    /* Some filesystems, e.g. ReiserFS, always return a type DT_UNKNOWN from readdir or scandir. */
    /* Call stat if we don't find DT_DIR to get the information we need. */
    /* Also works for symbolic links to directories. */
    if (d->d_type != DT_UNKNOWN && d->d_type != DT_LNK) {
        return d->d_type == DT_DIR;
    }
#endif
    char *full_path;
    struct stat s;
    int is_dir = FALSE;

    ag_asprintf(&full_path, "%s/%s", path, d->d_name);

    if (stat(full_path, &s) == 0) {
#ifdef _WIN32
        is_dir = GetFileAttributesA(full_path) & FILE_ATTRIBUTE_DIRECTORY;
#else
        is_dir = S_ISDIR(s.st_mode);
#endif
    }

    free(full_path);
    return is_dir;
}
/*
 * Report whether directory entry `d`, found inside `path`, is a symbolic
 * link. The result is non-zero for a link and zero otherwise.
 *
 * Windows: the entry counts as a link when its attributes carry
 * FILE_ATTRIBUTE_REPARSE_POINT. The path is heap-formatted with
 * ag_asprintf(), so long names cannot overflow a fixed buffer, and a failed
 * attribute query (INVALID_FILE_ATTRIBUTES, which has every bit set) is
 * reported as "not a link".
 *
 * Elsewhere: a known d_type is trusted; otherwise the full path is
 * lstat()ed, and a failing lstat() yields FALSE.
 */
int is_symlink(const char *path, const struct dirent *d) {
#ifdef _WIN32
    char *full_path;
    DWORD attrs;

    ag_asprintf(&full_path, "%s\\%s", path, d->d_name);
    attrs = GetFileAttributesA(full_path);
    free(full_path);

    if (attrs == INVALID_FILE_ATTRIBUTES) {
        return 0;
    }
    return (attrs & FILE_ATTRIBUTE_REPARSE_POINT) != 0;
#else
#ifdef HAVE_DIRENT_DTYPE
    /* Some filesystems, e.g. ReiserFS, always return a type DT_UNKNOWN from readdir or scandir. */
    /* Call lstat if we find DT_UNKNOWN to get the information we need. */
    if (d->d_type != DT_UNKNOWN) {
        return (d->d_type == DT_LNK);
    }
#endif
    char *full_path;
    struct stat s;

    ag_asprintf(&full_path, "%s/%s", path, d->d_name);
    if (lstat(full_path, &s) != 0) {
        free(full_path);
        return FALSE;
    }
    free(full_path);
    return S_ISLNK(s.st_mode);
#endif
}
/*
 * Report whether directory entry `d`, found inside `path`, is a FIFO.
 *
 * A known d_type is trusted; otherwise the full path is built and stat()ed.
 * Returns FALSE when stat() fails.
 */
int is_named_pipe(const char *path, const struct dirent *d) {
#ifdef HAVE_DIRENT_DTYPE
    if (d->d_type != DT_UNKNOWN) {
        return d->d_type == DT_FIFO;
    }
#endif
    char *full_path;
    struct stat s;
    int is_fifo = FALSE;

    ag_asprintf(&full_path, "%s/%s", path, d->d_name);

    if (stat(full_path, &s) == 0) {
        is_fifo = S_ISFIFO(s.st_mode);
    }

    free(full_path);
    return is_fifo;
}
/*
 * printf-style formatting into a newly allocated string stored in `*ret`,
 * via vasprintf(). Calls die() when vasprintf() reports -1, so callers never
 * see a failure. The caller frees `*ret`.
 */
void ag_asprintf(char **ret, const char *fmt, ...) {
    va_list ap;
    int rc;

    va_start(ap, fmt);
    rc = vasprintf(ret, fmt, ap);
    va_end(ap);

    if (rc == -1) {
        die("vasprintf returned -1");
    }
}
/*
 * Report a fatal error and terminate: the printf-style message is passed to
 * vplog() at LOG_LEVEL_ERR, then the process exits with status 2.
 * This function does not return.
 */
void die(const char *fmt, ...) {
    va_list ap;

    va_start(ap, fmt);
    vplog(LOG_LEVEL_ERR, fmt, ap);
    va_end(ap);

    exit(2);
}
#ifndef HAVE_FGETLN
/*
 * Fallback fgetln(): read one line from `fp`, up to and including the
 * newline.
 *
 * Returns a heap buffer holding the line and stores its length in `*lenp`.
 * The buffer is NOT NUL-terminated, and in this implementation the caller
 * owns it and must free() it. Returns NULL with `*lenp` set to 0 when
 * nothing could be read. Returns NULL without touching `*lenp` when memory
 * runs out.
 *
 * The stream is locked for the duration of the read, so the unlocked getc
 * variant can be used. Sizes are kept in size_t so that very long lines
 * cannot overflow a signed counter.
 */
char *fgetln(FILE *fp, size_t *lenp) {
    char *buf = NULL;
    size_t cap = 0;
    size_t len = 0;
    int c;

    flockfile(fp);
    while ((c = getc_unlocked(fp)) != EOF) {
        if (len >= cap) {
            /* Grow in BUFSIZ steps; keep `buf` valid if realloc() fails. */
            const size_t ncap = cap + BUFSIZ;
            char *grown = realloc(buf, ncap);

            if (grown == NULL) {
                funlockfile(fp);
                free(buf);
                return NULL;
            }
            buf = grown;
            cap = ncap;
        }
        buf[len++] = (char)c;
        if (c == '\n') {
            break;
        }
    }
    funlockfile(fp);

    *lenp = len;
    return buf;
}
#endif
#ifndef HAVE_GETLINE
/*
 * Do it yourself getline() implementation, built on fgetln().
 *
 * Reads one line from `stream` into `*lineptr`, growing that buffer with
 * realloc() when it is missing or too small, and NUL-terminates it.
 *
 * lineptr  in/out: heap buffer (may point to NULL); updated on growth
 * n        in/out: capacity of `*lineptr`; ignored while `*lineptr` is NULL
 *
 * Returns the number of bytes read (newline included, NUL excluded), or -1
 * on end of input, read failure, or allocation failure.
 */
ssize_t getline(char **lineptr, size_t *n, FILE *stream) {
    size_t len = 0;
    char *srcln = NULL;
    char *newlnptr = NULL;

    /* get line, bail on error */
    srcln = fgetln(stream, &len);
    if (srcln == NULL) {
        return -1;
    }

    /* A NULL buffer always needs allocating, whatever *n claims. */
    if (*lineptr == NULL || len >= *n) {
        /* line is too big for buffer, must realloc */
        /* double the buffer, bail on error */
        newlnptr = realloc(*lineptr, len * 2);
        if (newlnptr == NULL) {
#ifndef HAVE_FGETLN
            /* Our own fgetln() handed us a malloc()d buffer: don't leak it. */
            free(srcln);
#endif
            return -1;
        }
        *lineptr = newlnptr;
        *n = len * 2;
    }

    memcpy(*lineptr, srcln, len);

#ifndef HAVE_FGETLN
    /* Our own implementation of fgetln() returns a malloc()d buffer that we
     * must free
     */
    free(srcln);
#endif

    (*lineptr)[len] = '\0';
    return (ssize_t)len;
}
#endif
/*
 * Locate the line that starts at `buf_offset` inside `buf`.
 *
 * Stores a pointer to the start of the line in `*line` and returns its
 * length: the number of bytes before the next '\n', or up to the end of the
 * buffer when no newline follows. The newline itself is not counted.
 * Returns 0 when `buf_offset` is at or beyond `buf_len`.
 *
 * Only bytes inside [buf_offset, buf_len) are examined, so a buffer that
 * lacks a trailing newline is never read past its end.
 */
ssize_t buf_getline(const char **line, const char *buf, const size_t buf_len, const size_t buf_offset) {
    const char *cur = buf + buf_offset;
    const size_t remaining = (buf_offset < buf_len) ? buf_len - buf_offset : 0;
    const char *newline = NULL;

    if (remaining > 0) {
        newline = memchr(cur, '\n', remaining);
    }

    *line = cur;
    if (newline != NULL) {
        return (ssize_t)(newline - cur);
    }
    return (ssize_t)remaining;
}
#ifndef HAVE_REALPATH
/*
 * realpath() for Windows. Turns slashes into backslashes and calls _fullpath
 *
 * `path` is first copied (truncated if necessary) into a local buffer of
 * _MAX_PATH + 1 bytes, so the caller's string is never modified. The return
 * value is whatever _fullpath() returns for `resolved_path`.
 */
char *realpath(const char *path, char *resolved_path) {
    char native[_MAX_PATH + 1];
    char *cursor;

    strlcpy(native, path, sizeof(native));

    for (cursor = native; *cursor != '\0'; cursor++) {
        if (*cursor == '/') {
            *cursor = '\\';
        }
    }

    return _fullpath(resolved_path, native, _MAX_PATH);
}
#endif
#ifndef HAVE_STRLCPY
/*
 * Fallback strlcpy(): copy the NUL-terminated string `src` into `dst`,
 * which has room for `size` bytes.
 *
 * At most size - 1 bytes are copied and `dst` is always NUL-terminated
 * unless `size` is 0, in which case `dst` is left untouched.
 *
 * Returns the length of `src` (not counting its NUL); a return value
 * >= `size` means the copy was truncated.
 */
size_t strlcpy(char *dst, const char *src, size_t size) {
    const size_t src_len = strlen(src);

    if (size != 0) {
        /* Copy as many bytes as will fit, leaving room for the NUL. */
        const size_t copy_len = (src_len < size - 1) ? src_len : size - 1;

        memcpy(dst, src, copy_len);
        dst[copy_len] = '\0';
    }

    return src_len; /* count does not include NUL */
}
#endif
#ifndef HAVE_VASPRINTF
/*
 * Fallback vasprintf(): format into a newly malloc()ed string.
 *
 * On success `*ret` points to the NUL-terminated result (the caller frees
 * it) and the number of characters written, excluding the NUL, is returned.
 * On failure a negative value is returned and `*ret` is NULL.
 *
 * The output is formatted twice: once with a NULL buffer to measure it, then
 * into the allocation. The measuring pass consumes a private copy of `args`,
 * because vsnprintf can destroy the list it is given. `args` itself belongs
 * to the caller, which is responsible for va_end() on it.
 */
int vasprintf(char **ret, const char *fmt, va_list args) {
    int rv;
    size_t size;
    va_list probe;

    *ret = NULL;

#if defined(va_copy)
    /* C99 macro. We compile with -std=c89 but you never know */
    va_copy(probe, args);
#elif defined(__va_copy)
    /* non-standard macro, but usually exists */
    __va_copy(probe, args);
#else
    /* Ancient compiler. This usually works but there are no guarantees. */
    memcpy(probe, args, sizeof(va_list));
#endif

    rv = vsnprintf(NULL, 0, fmt, probe);
    va_end(probe);
    if (rv < 0) {
        return rv;
    }

    size = (size_t)rv + 1; /* vsnprintf doesn't count \0 */
    *ret = malloc(size);
    if (*ret == NULL) {
        return -1;
    }

    rv = vsnprintf(*ret, size, fmt, args);
    if (rv < 0) {
        /* Don't hand a dangling pointer back to the caller. */
        free(*ret);
        *ret = NULL;
    }
    return rv;
}
#endif