Compare commits

...

1 commit

Author SHA1 Message Date
Geoff Greer
f90bf93036 Start of pcre2 stuff. Doesn't even compile yet. 2016-12-03 12:44:20 -08:00
13 changed files with 93 additions and 75 deletions

View file

@ -2,7 +2,7 @@ ACLOCAL_AMFLAGS = ${ACLOCAL_FLAGS}
bin_PROGRAMS = ag
ag_SOURCES = src/ignore.c src/ignore.h src/log.c src/log.h src/options.c src/options.h src/print.c src/print_w32.c src/print.h src/scandir.c src/scandir.h src/search.c src/search.h src/lang.c src/lang.h src/util.c src/util.h src/decompress.c src/decompress.h src/uthash.h src/main.c
ag_LDADD = ${PCRE_LIBS} ${LZMA_LIBS} ${ZLIB_LIBS} $(PTHREAD_LIBS)
ag_LDADD = ${PCRE2_LIBS} ${LZMA_LIBS} ${ZLIB_LIBS} $(PTHREAD_LIBS)
dist_man_MANS = doc/ag.1

View file

@ -34,7 +34,7 @@ There are also [graphs of performance across releases](http://geoff.greer.fm/ag/
* Files are `mmap()`ed instead of read into a buffer.
* Literal string searching uses [Boyer-Moore strstr](https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm).
* Regex searching uses [PCRE's JIT compiler](http://sljit.sourceforge.net/pcre.html) (if Ag is built with PCRE >=8.21).
* Ag calls `pcre_study()` before executing the same regex on every file.
* Ag calls `pcre2_jit_compile()` before executing the same regex on every file.
* Instead of calling `fnmatch()` on every pattern in your ignore files, non-regex patterns are loaded into arrays and binary searched.
I've written several blog posts showing how I've improved performance. These include how I [added pthreads](http://geoff.greer.fm/2012/09/07/the-silver-searcher-adding-pthreads/), [wrote my own `scandir()`](http://geoff.greer.fm/2012/09/03/profiling-ag-writing-my-own-scandir/), [benchmarked every revision to find performance regressions](http://geoff.greer.fm/2012/08/25/the-silver-searcher-benchmarking-revisions/), and profiled with [gprof](http://geoff.greer.fm/2012/02/08/profiling-with-gprof/) and [Valgrind](http://geoff.greer.fm/2012/01/23/making-programs-faster-profiling/).
@ -94,23 +94,23 @@ Run the relevant [`setup-*.exe`](https://cygwin.com/install.html), and select "t
### Building master
1. Install dependencies (Automake, pkg-config, PCRE, LZMA):
1. Install dependencies (Automake, pkg-config, PCRE2, LZMA):
* OS X:
brew install automake pkg-config pcre xz
brew install automake pkg-config pcre2 xz
or
port install automake pkgconfig pcre xz
port install automake pkgconfig pcre2 xz
* Ubuntu/Debian:
apt-get install -y automake pkg-config libpcre3-dev zlib1g-dev liblzma-dev
apt-get install -y automake pkg-config libpcre2-dev zlib1g-dev liblzma-dev
* Fedora:
yum -y install pkgconfig automake gcc zlib-devel pcre-devel xz-devel
yum -y install pkgconfig automake gcc zlib-devel pcre2-devel xz-devel
* CentOS:
yum -y groupinstall "Development Tools"
yum -y install pcre-devel xz-devel
yum -y install pcre2-devel xz-devel
* Windows: It's complicated. See [this wiki page](https://github.com/ggreer/the_silver_searcher/wiki/Windows).
2. Run the build script (which just runs aclocal, automake, etc):

View file

@ -15,7 +15,8 @@ m4_ifdef(
[AM_SILENT_RULES],
[AM_SILENT_RULES([yes])])
PKG_CHECK_MODULES([PCRE], [libpcre])
PKG_CHECK_MODULES([PCRE2], [libpcre2-8])
AC_DEFINE([PCRE2_CODE_UNIT_WIDTH], [8], [Use the 8-bit code unit PCRE2 library])
m4_include([m4/ax_pthread.m4])
AX_PTHREAD(
@ -24,7 +25,7 @@ AX_PTHREAD(
)
# Run CFLAGS="-pg" ./configure if you want gprof profiling instrumentation (use CFLAGS="-g" for debug symbols)
CFLAGS="$CFLAGS $PTHREAD_CFLAGS $PCRE_CFLAGS -Wall -Wextra -Wformat=2 -Wno-format-nonliteral -Wshadow -Wpointer-arith -Wcast-qual -Wmissing-prototypes -Wno-missing-braces -std=gnu89 -D_GNU_SOURCE -O2"
CFLAGS="$CFLAGS $PTHREAD_CFLAGS $PCRE2_CFLAGS -Wall -Wextra -Wformat=2 -Wno-format-nonliteral -Wshadow -Wpointer-arith -Wcast-qual -Wmissing-prototypes -Wno-missing-braces -std=gnu89 -D_GNU_SOURCE -O2"
LDFLAGS="$LDFLAGS"
case $host in
@ -50,7 +51,7 @@ AS_IF([test "x$enable_lzma" != "xno"], [
PKG_CHECK_MODULES([LZMA], [liblzma])
])
AC_CHECK_DECL([PCRE_CONFIG_JIT], [AC_DEFINE([USE_PCRE_JIT], [], [Use PCRE JIT])], [], [#include <pcre.h>])
AC_CHECK_DECL([PCRE2_CONFIG_JIT], [AC_DEFINE([USE_PCRE2_JIT], [], [Use PCRE2 JIT])], [], [#include <pcre2.h>])
AC_CHECK_DECL([CPU_ZERO, CPU_SET], [AC_DEFINE([USE_CPU_SET], [], [Use CPU_SET macros])] , [], [#include <sched.h>])

View file

@ -193,7 +193,7 @@ static int ackmate_dir_match(const char *dir_name) {
return 0;
}
/* we just care about the match, not where the matches are */
return pcre_exec(opts.ackmate_dir_filter, NULL, dir_name, strlen(dir_name), 0, 0, NULL, 0);
return pcre2_match(opts.ackmate_dir_filter, dir_name, strlen(dir_name), 0, 0, NULL, NULL);
}
/* This is the hottest code in Ag. 10-15% of all execution time is spent here */

View file

@ -1,5 +1,7 @@
#include "config.h"
#include <ctype.h>
#include <pcre.h>
#include <pcre2.h>
#include <stdarg.h>
#include <stdio.h>
#include <string.h>
@ -9,8 +11,6 @@
#include <windows.h>
#endif
#include "config.h"
#ifdef HAVE_PTHREAD_H
#include <pthread.h>
#endif
@ -29,7 +29,7 @@ int main(int argc, char **argv) {
char **base_paths = NULL;
char **paths = NULL;
int i;
int pcre_opts = PCRE_MULTILINE;
int pcre_opts = PCRE2_MULTILINE;
int study_opts = 0;
worker_t *workers = NULL;
int workers_len;
@ -49,19 +49,22 @@ int main(int argc, char **argv) {
out_fd = stdout;
parse_options(argc, argv, &base_paths, &paths);
log_debug("PCRE Version: %s", pcre_version());
log_debug("PCRE Version: %s", pcre2_version());
if (opts.stats) {
memset(&stats, 0, sizeof(stats));
gettimeofday(&(stats.time_start), NULL);
}
#ifdef USE_PCRE_JIT
int has_jit = 0;
pcre_config(PCRE_CONFIG_JIT, &has_jit);
if (has_jit) {
study_opts |= PCRE_STUDY_JIT_COMPILE;
}
#endif
/*
TODO: call pcre2_jit_compile in compile_study
// #ifdef USE_PCRE2_JIT
// int has_jit = 0;
// pcre2_config(PCRE2_CONFIG_JIT, &has_jit);
// if (has_jit) {
// study_opts |= PCRE2_STUDY_JIT_COMPILE;
// }
// #endif
*/
#ifdef _WIN32
{
@ -123,7 +126,7 @@ int main(int argc, char **argv) {
}
} else {
if (opts.casing == CASE_INSENSITIVE) {
pcre_opts |= PCRE_CASELESS;
pcre_opts |= PCRE2_CASELESS;
}
if (opts.word_regexp) {
char *word_regexp_query;
@ -132,7 +135,7 @@ int main(int argc, char **argv) {
opts.query = word_regexp_query;
opts.query_len = strlen(opts.query);
}
compile_study(&opts.re, &opts.re_extra, opts.query, pcre_opts, study_opts);
compile_study(&opts.re, &opts.re_ctx, opts.query, pcre_opts, study_opts);
}
if (opts.search_stream) {

View file

@ -1,3 +1,5 @@
#include "config.h"
#include <errno.h>
#include <limits.h>
#include <stdarg.h>
@ -8,7 +10,6 @@
#include <sys/stat.h>
#include <unistd.h>
#include "config.h"
#include "ignore.h"
#include "lang.h"
#include "log.h"
@ -124,7 +125,7 @@ void print_version(void) {
char lzma = '-';
char zlib = '-';
#ifdef USE_PCRE_JIT
#ifdef USE_PCRE2_JIT
jit = '+';
#endif
#ifdef HAVE_LZMA_H
@ -169,24 +170,23 @@ void cleanup_options(void) {
free(opts.query);
}
pcre_free(opts.re);
if (opts.re_extra) {
/* Using pcre_free_study on pcre_extra* can segfault on some versions of PCRE */
pcre_free(opts.re_extra);
pcre2_code_free(opts.re);
if (opts.re_ctx) {
pcre2_compile_context_free(opts.re_ctx);
}
if (opts.ackmate_dir_filter) {
pcre_free(opts.ackmate_dir_filter);
pcre2_code_free(opts.ackmate_dir_filter);
}
if (opts.ackmate_dir_filter_extra) {
pcre_free(opts.ackmate_dir_filter_extra);
if (opts.ackmate_dir_filter_ctx) {
pcre2_compile_context_free(opts.ackmate_dir_filter_ctx);
}
if (opts.file_search_regex) {
pcre_free(opts.file_search_regex);
pcre2_code_free(opts.file_search_regex);
}
if (opts.file_search_regex_extra) {
pcre_free(opts.file_search_regex_extra);
if (opts.file_search_regex_ctx) {
pcre2_compile_context_free(opts.file_search_regex_ctx);
}
}
@ -504,7 +504,7 @@ void parse_options(int argc, char **argv, char **base_paths[], char **paths[]) {
break;
case 0: /* Long option */
if (strcmp(longopts[opt_index].name, "ackmate-dir-filter") == 0) {
compile_study(&opts.ackmate_dir_filter, &opts.ackmate_dir_filter_extra, optarg, 0, 0);
compile_study(&opts.ackmate_dir_filter, &opts.ackmate_dir_filter_ctx, optarg, 0, 0);
break;
} else if (strcmp(longopts[opt_index].name, "depth") == 0) {
opts.max_search_depth = atoi(optarg);
@ -587,21 +587,21 @@ void parse_options(int argc, char **argv, char **base_paths[], char **paths[]) {
if (file_search_regex) {
int pcre_opts = 0;
if (opts.casing == CASE_INSENSITIVE || (opts.casing == CASE_SMART && is_lowercase(file_search_regex))) {
pcre_opts |= PCRE_CASELESS;
pcre_opts |= PCRE2_CASELESS;
}
if (opts.word_regexp) {
char *old_file_search_regex = file_search_regex;
ag_asprintf(&file_search_regex, "\\b%s\\b", file_search_regex);
free(old_file_search_regex);
}
compile_study(&opts.file_search_regex, &opts.file_search_regex_extra, file_search_regex, pcre_opts, 0);
compile_study(&opts.file_search_regex, &opts.file_search_regex_ctx, file_search_regex, pcre_opts, 0);
free(file_search_regex);
}
if (has_filetype) {
num_exts = combine_file_extensions(ext_index, lang_num, &extensions);
lang_regex = make_lang_regex(extensions, num_exts);
compile_study(&opts.file_search_regex, &opts.file_search_regex_extra, lang_regex, 0, 0);
compile_study(&opts.file_search_regex, &opts.file_search_regex_ctx, lang_regex, 0, 0);
}
if (extensions) {

View file

@ -1,10 +1,12 @@
#ifndef OPTIONS_H
#define OPTIONS_H
#include "config.h"
#include <getopt.h>
#include <sys/stat.h>
#include <pcre.h>
#include <pcre2.h>
#define DEFAULT_AFTER_LEN 2
#define DEFAULT_BEFORE_LEN 2
@ -28,15 +30,15 @@ enum path_print_behavior {
typedef struct {
int ackmate;
pcre *ackmate_dir_filter;
pcre_extra *ackmate_dir_filter_extra;
pcre2_code *ackmate_dir_filter;
pcre2_compile_context *ackmate_dir_filter_ctx;
size_t after;
size_t before;
enum case_behavior casing;
const char *file_search_string;
int match_files;
pcre *file_search_regex;
pcre_extra *file_search_regex_extra;
pcre2_code *file_search_regex;
pcre2_compile_context *file_search_regex_ctx;
int color;
char *color_line_number;
char *color_match;
@ -64,8 +66,8 @@ typedef struct {
int print_line_numbers;
int print_long_lines; /* TODO: support this in print.c */
int passthrough;
pcre *re;
pcre_extra *re_extra;
pcre2_code *re;
pcre2_compile_context *re_ctx;
int recurse_dirs;
int search_all_files;
int skip_vcs_ignores;

View file

@ -100,8 +100,11 @@ void search_buf(const char *buf, const size_t buf_len,
} else {
int offset_vector[3];
if (opts.multiline) {
/* we just care about the match, not where the matches are */
return pcre2_match(opts.ackmate_dir_filter, dir_name, strlen(dir_name), 0, 0, NULL, NULL);
while (buf_offset < buf_len &&
(pcre_exec(opts.re, opts.re_extra, buf, buf_len, buf_offset, 0, offset_vector, 3)) >= 0) {
(pcre2_match(opts.re, buf, buf_len, buf_offset, 0, match_data)) >= 0) {
log_debug("Regex match found. File %s, offset %i bytes.", dir_full_path, offset_vector[0]);
buf_offset = offset_vector[1];
if (offset_vector[0] == offset_vector[1]) {
@ -129,7 +132,7 @@ void search_buf(const char *buf, const size_t buf_len,
}
size_t line_offset = 0;
while (line_offset < line_len) {
int rv = pcre_exec(opts.re, opts.re_extra, line, line_len, line_offset, 0, offset_vector, 3);
int rv = pcre2_match(opts.re, opts.re_ctx, line, line_len, line_offset, 0, offset_vector, 3);
if (rv < 0) {
break;
}
@ -290,11 +293,6 @@ void search_file(const char *file_full_path) {
goto cleanup;
}
if (!opts.literal && f_len > INT_MAX) {
log_err("Skipping %s: pcre_exec() can't handle files larger than %i bytes.", file_full_path, INT_MAX);
goto cleanup;
}
#ifdef _WIN32
{
HANDLE hmmap = CreateFileMapping(
@ -376,6 +374,7 @@ void *search_file_worker(void *i) {
int worker_id = *(int *)i;
log_debug("Worker %i started", worker_id);
match_data = pcre2_match_data_create_from_pattern(re, NULL);
while (TRUE) {
pthread_mutex_lock(&work_queue_mtx);
while (work_queue == NULL) {
@ -543,7 +542,7 @@ void search_dir(ignores *ig, const char *base_path, const char *path, const int
if (!is_directory(path, dir)) {
if (opts.file_search_regex) {
rc = pcre_exec(opts.file_search_regex, NULL, dir_full_path, strlen(dir_full_path),
rc = pcre2_match(opts.file_search_regex, NULL, dir_full_path, strlen(dir_full_path),
0, 0, offset_vector, 3);
if (rc < 0) { /* no match */
log_debug("Skipping %s due to file_search_regex.", dir_full_path);

View file

@ -1,11 +1,13 @@
#ifndef SEARCH_H
#define SEARCH_H
#include "config.h"
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <pcre.h>
#include <pcre2.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@ -17,8 +19,6 @@
#include <sys/stat.h>
#include <unistd.h>
#include "config.h"
#ifdef HAVE_PTHREAD_H
#include <pthread.h>
#endif

View file

@ -1,3 +1,5 @@
#include "config.h"
#include <ctype.h>
#include <stdarg.h>
#include <stdio.h>
@ -5,7 +7,6 @@
#include <string.h>
#include <sys/stat.h>
#include "config.h"
#include "util.h"
#ifdef _WIN32
@ -331,19 +332,23 @@ void realloc_matches(match_t **matches, size_t *matches_size, size_t matches_len
*matches = ag_realloc(*matches, *matches_size * sizeof(match_t));
}
void compile_study(pcre **re, pcre_extra **re_extra, char *q, const int pcre_opts, const int study_opts) {
void compile_study(pcre2_code **re, pcre2_compile_context **re_ctx, char *q, const uint32_t pcre_opts, const int study_opts) {
const char *pcre_err = NULL;
int pcre_err_offset = 0;
*re = pcre_compile(q, pcre_opts, &pcre_err, &pcre_err_offset, NULL);
*re = pcre2_compile(q, pcre_opts, &pcre_err, &pcre_err_offset, NULL, NULL);
if (*re == NULL) {
die("Bad regex! pcre_compile() failed at position %i: %s\nIf you meant to search for a literal string, run ag with -Q",
// TODO: use pcre2_get_error_message()
die("Bad regex! pcre2_compile() failed at position %i: %s\nIf you meant to search for a literal string, run ag with -Q",
pcre_err_offset,
pcre_err);
}
*re_extra = pcre_study(*re, study_opts, &pcre_err);
if (*re_extra == NULL) {
log_debug("pcre_study returned nothing useful. Error: %s", pcre_err);
pcre2_jit_compile(*re, pcre_opts);
*re_ctx = NULL;
*re_ctx = pcre2_match_data_create_from_pattern(*re, NULL);
// *re_ctx = pcre2_init_context(NULL);
if (*re_ctx == NULL) {
log_debug("pcre2_init_context returned nothing useful. Error: %s", pcre_err);
}
}

View file

@ -2,13 +2,14 @@
#define UTIL_H
#include <dirent.h>
#include <pcre.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/time.h>
#include "config.h"
#include <pcre2.h>
#include "log.h"
#include "options.h"
@ -76,7 +77,7 @@ strncmp_fp get_strstr(enum case_behavior opts);
size_t invert_matches(const char *buf, const size_t buf_len, match_t matches[], size_t matches_len);
void realloc_matches(match_t **matches, size_t *matches_size, size_t matches_len);
void compile_study(pcre **re, pcre_extra **re_extra, char *q, const int pcre_opts, const int study_opts);
void compile_study(pcre2_code **re, pcre2_compile_context **re_ctx, char *q, const uint32_t pcre_opts, const int study_opts);
int is_binary(const void *buf, const size_t buf_len);

View file

@ -15,7 +15,14 @@ Search a big file:
234881024:hello7516192768
268435456:hello
Fail to regex search a big file:
Regex search a big file:
$ $TESTDIR/../../ag --nocolor --workers=1 --parallel 'hello.*' $TESTDIR/big_file.txt
ERR: Skipping */big_file.txt: pcre_exec() can't handle files larger than 2147483647 bytes. (glob)
[1]
33554432:hello1073741824
67108864:hello2147483648
100663296:hello3221225472
134217728:hello4294967296
167772160:hello5368709120
201326592:hello6442450944
234881024:hello7516192768
268435456:hello

View file

@ -12,8 +12,8 @@ URL: https://github.com/ggreer/%{name}
Source0: https://github.com/downloads/ggreer/%{name}/%{name}-%{version}.tar.gz
BuildRoot: %(mktemp -ud %{_tmppath}/%{name}-%{version}-%{release}-XXXXXX)
BuildRequires: pcre-devel, xz-devel, zlib-devel
Requires: pcre, xz, zlib
BuildRequires: pcre2-devel, xz-devel, zlib-devel
Requires: pcre2, xz, zlib
%description
The Silver Searcher
@ -29,7 +29,7 @@ How is it so fast?
* Searching for literals (no regex) uses Boyer-Moore-Horspool strstr.
* Files are mmap()ed instead of read into a buffer.
* If you're building with PCRE 8.21 or greater, regex searches use the JIT compiler.
* Ag calls pcre_study() before executing the regex on a jillion files.
* Ag calls pcre2_jit_compile() before executing the regex on a jillion files.
* Instead of calling fnmatch() on every pattern in your ignore files, non-regex patterns are loaded into an array and binary searched.
* Ag uses Pthreads to take advantage of multiple CPU cores and search files in parallel.