From 6c57ab85c5ddca3e87a16af097e32621102097d8 Mon Sep 17 00:00:00 2001 From: ucko Date: Thu, 18 Jul 2024 18:33:56 +0000 Subject: [PATCH] [PATCH] Allow substituting PCRE2 for legacy PCRE by explicit request. - For now, require opt-in via --with-components="...;PCRE2;..." (CMake), --with-pcre2 (traditional Unix build system), or uncommenting the relevant ThirdParty_PCRE2 setting in project_tree_builder.ini (traditional Windows build system). - Likewise, hold off on switching the bundled copy to PCRE2 or checking for any functions or headers that will become of interest. - Redundantly (for now) shun external PCRE2 in bin-release configurations. - Otherwise favor (allowed!) external installations over the bundled copy, preferring PCRE2 over legacy PCRE when both are found and allowed but (in due course) external legacy PCRE over bundled PCRE2 in the absence of external PCRE2. - In the traditional build system, have the widely used PCRE_LIBS macro correspond to whichever PCRE is default (when not falling back on a bundled copy), and add a PCRE_LEGACY_LIBS macro for the sake of anything using legacy PCRE directly (very occasionally seen). JIRA: CXX-12761, git-svn-id: https://anonsvn.ncbi.nlm.nih.gov/repos/v1/trunk/c++@102806 78c7ea69-d796-4a43-9a09-de51944f1b03 Irrelevant (and inapplicable) changes to c++/src/build-system/cmake/, c++/src/build-system/configure (which will be regenerated anyway), and c++/src/build-system/project_tree_builder.ini elided. Gbp-Pq: Name allow_pcre2 --- c++/include/util/xregexp/regexp.hpp | 13 ++++- c++/src/build-system/Makefile.mk.in | 3 + c++/src/build-system/config.h.in | 3 + c++/src/build-system/configure.ac | 34 +++++++++-- c++/src/util/xregexp/CMakeLists.txt | 7 ++- c++/src/util/xregexp/Makefile.xregexp.lib | 6 +- c++/src/util/xregexp/regexp.cpp | 69 +++++++++++++++++++++-- 7 files changed, 121 insertions(+), 14 deletions(-) diff --git a/c++/include/util/xregexp/regexp.hpp b/c++/include/util/xregexp/regexp.hpp index 332b646b..e7d00f97 100644 --- a/c++/include/util/xregexp/regexp.hpp +++ b/c++/include/util/xregexp/regexp.hpp @@ -70,7 +70,11 @@ class NCBI_XREGEXP_EXPORT CRegexp { public: /// Element type for GetResults(). +#ifdef HAVE_LIBPCRE2 + typedef size_t TOffset; +#else typedef int TOffset; +#endif /// Type definitions used for code clarity. typedef unsigned int TCompile; ///< Compilation options. @@ -287,13 +291,20 @@ private: void x_Match(CTempString str, size_t offset, TMatch flags); void* m_PReg; /// Pointer to compiled PCRE pattern. - void* m_Extra; /// Pointer to extra structure used for pattern study. +#ifdef HAVE_LIBPCRE2 + void* m_MatchData; + TOffset* m_Results; + int m_JITStatus; +#else + void* m_Extra; /// Pointer to extra structure used for pattern study. /// Array of locations of patterns/subpatterns resulting from /// the last call to GetMatch(). Also contains 1/3 extra space used /// internally by the PCRE C library. int m_Results[(kRegexpMaxSubPatterns +1) * 3]; +#endif + /// The total number of pattern + subpatterns resulting from /// the last call to GetMatch. diff --git a/c++/src/build-system/Makefile.mk.in b/c++/src/build-system/Makefile.mk.in index f95e0fed..da5cefa0 100644 --- a/c++/src/build-system/Makefile.mk.in +++ b/c++/src/build-system/Makefile.mk.in @@ -390,6 +390,9 @@ CMPRS_LIB = $(Z_LIB) $(BZ2_LIB) $(ZSTD_LIB) # wrapper and goes by the name "regexp". PCRE_INCLUDE = @PCRE_INCLUDE@ PCRE_LIBS = @PCRE_LIBS@ +PCRE_LEGACY_LIBS = @PCRE_LEGACY_LIBS@ +PCRE2_INCLUDE = @PCRE2_INCLUDE@ +PCRE2_LIBS = @PCRE2_LIBS@ PCREPOSIX_LIBS = @PCREPOSIX_LIBS@ PCRE_LIB = @PCRE_LIB@ diff --git a/c++/src/build-system/config.h.in b/c++/src/build-system/config.h.in index 5c97eab5..eb17f931 100644 --- a/c++/src/build-system/config.h.in +++ b/c++/src/build-system/config.h.in @@ -486,6 +486,9 @@ /* Define to 1 if libpcre is available. */ #undef HAVE_LIBPCRE +/* Define to 1 if libpcre2 is available. */ +#undef HAVE_LIBPCRE2 + /* Define to 1 if libpng is available. */ #undef HAVE_LIBPNG diff --git a/c++/src/build-system/configure.ac b/c++/src/build-system/configure.ac index 8b845ceb..7f4f962a 100644 --- a/c++/src/build-system/configure.ac +++ b/c++/src/build-system/configure.ac @@ -90,7 +90,7 @@ case "$with_3psw" in with_yaml_cpp=no fi m4_foreach(X, [sss, sssutils, sssdb, vdb, ngs, ncbicrypt, libunwind, - z, bz2, lzo, zstd, pcre, mbedtls, + z, bz2, lzo, zstd, pcre, pcre2, mbedtls, gmp, gcrypt, nettle, gnutls, openssl, krb5, boost, lmdb, sybase, ftds, mysql, opengl, mesa, glut, glew, gl2ps, wxwidgets, freetype, ftgl, fastcgi, fastcgipp, @@ -324,6 +324,10 @@ AC_ARG_WITH(pcre, [ --with-pcre=DIR use PCRE installation in DIR]) AC_ARG_WITH(pcre, [ --without-pcre use internal copy of PCRE]) +AC_ARG_WITH(pcre2, + [ --with-pcre2=DIR use PCRE2 installation in DIR]) +AC_ARG_WITH(pcre, + [ --without-pcre2 do not use PCRE2]) AC_ARG_WITH(mbedtls, [ --with-mbedtls(=DIR) use external mbedTLS installation (in DIR)]) AC_ARG_WITH(gmp, @@ -728,7 +732,7 @@ ncbi-c wxwidgets wxwidgets-ucs fastcgi fastcgipp \ sss sssdb sssutils included-sss \ geo included-geo vdb downloaded-vdb static-vdb ngs ncbicrypt libunwind libdw \ backward-cpp backward-cpp-sig \ -z bz2 lzo zstd pcre mbedtls \ +z bz2 lzo zstd pcre pcre2 mbedtls \ gmp gcrypt nettle gnutls static-gnutls openssl krb5 \ sybase sybase-local sybase-new ftds mysql \ orbacus freetype ftgl opengl mesa glut glew glew-mx gl2ps \ @@ -798,7 +802,7 @@ for x_arg in "$@" ; do | --with-ncbicrypt=* \ | --with-libunwind=* | --with-libdw=* | --with-backward-cpp=* \ | --with-z=* | --with-bz2=* | --with-lzo=* | --with-zstd=* \ - | --with-pcre=* | --with-mbedtls=* \ + | --with-pcre=* | --with-pcre2=* | --with-mbedtls=* \ | --with-gmp=* | --with-gcrypt=* | --with-nettle=* \ | --with-gnutls=* | --with-openssl=* | --with-krb5=* \ | --with-sybase-local=* | --with-ftds=*/* | --with-mysql=* \ @@ -892,6 +896,7 @@ if test "$with_bin_release" = "yes" ; then : ${with_libdw=no} : ${with_ncbicrypt=no} : ${with_pcre=no} # Too much variation across distributions. + : ${with_pcre2=no} : ${with_sse42=no} AC_DEFINE(NCBI_BIN_RELEASE, 1, [Define to 1 when building binaries for public release.]) @@ -4832,7 +4837,18 @@ fi NCBI_CHECK_THIRD_PARTY_LIB(pcre, [AC_LANG_PROGRAM([@%:@include ], [[const char*s[]={"x"}; pcre* p; pcre_extra* x = pcre_study(p, 1, s);]])]) -if test -z "$PCRE_LIBS"; then + +: ${with_pcre2=no} +NCBI_CHECK_THIRD_PARTY_LIB_EX(pcre2, PCRE2, pcre2-8, + [AC_LANG_PROGRAM([@%:@define PCRE2_CODE_UNIT_WIDTH 8 + @%:@include ], + [[pcre2_config(123, NULL);]])]) +PCRE_LEGACY_LIBS=$PCRE_LIBS +if test "x$with_pcre2" != xno; then + PCRE_LIBS=$PCRE2_LIBS +fi + +if test -z "$PCRE_LIBS" -a -z "$PCRE2_LIBS"; then pcrelocal=util/regexp AC_MSG_NOTICE([using local PCRE copy in $pcrelocal]) PCRE_PATH="<$pcrelocal>" @@ -4841,8 +4857,15 @@ if test -z "$PCRE_LIBS"; then # PCREPOSIX_LIBS="-lregexp" PCRE_LIB="regexp" AC_DEFINE(USE_LOCAL_PCRE, 1, [Define to 1 if using a local copy of PCRE.]) - NCBI_PACKAGE(PCRE) + if test -f $srcdir/include/$pcrelocal/pcre2.h; then + AC_DEFINE(HAVE_LIBPCRE2, 1, [Define to 1 if libpcre2 is available.]) + NCBI_PACKAGE(PCRE2) + else + NCBI_PACKAGE(PCRE) + fi NCBI_PACKAGE(LocalPCRE) +elif test -n "$PCRE2_LIBS"; then + PCREPOSIX_LIBS=`echo "$PCRE2_LIBS" | sed -e 's/-lpcre2-8/-lpcre2posix &/'` else PCREPOSIX_LIBS=`echo "$PCRE_LIBS" | sed -e 's/-lpcre/-lpcreposix -lpcre/'` fi @@ -9871,6 +9894,7 @@ AC_SUBST(BZ2_LIB) AC_SUBST(ZSTD_STATIC_LIBS) AC_SUBST(PCREPOSIX_LIBS) AC_SUBST(PCRE_LIB) +AC_SUBST(PCRE_LEGACY_LIBS) AC_SUBST(OPENSSL_STATIC_LIBS) AC_SUBST(CURL_STATIC_LIBS) AC_SUBST(SYBASE_PATH) diff --git a/c++/src/util/xregexp/CMakeLists.txt b/c++/src/util/xregexp/CMakeLists.txt index d1c958ff..bca19d4a 100644 --- a/c++/src/util/xregexp/CMakeLists.txt +++ b/c++/src/util/xregexp/CMakeLists.txt @@ -1,6 +1,11 @@ # $Id: CMakeLists.txt 621427 2020-12-11 14:26:55Z ivanov $ NCBI_project_tags(core) -NCBI_requires(PCRE) +# NCBI_optional_components(PCRE PCRE2) +if(NCBI_COMPONENT_PCRE2_FOUND) + NCBI_REQUIRES(PCRE2) +else() + NCBI_REQUIRES(PCRE) +endif() NCBI_add_library(xregexp xregexp_template_tester) diff --git a/c++/src/util/xregexp/Makefile.xregexp.lib b/c++/src/util/xregexp/Makefile.xregexp.lib index d439e4b8..81e0f3c2 100644 --- a/c++/src/util/xregexp/Makefile.xregexp.lib +++ b/c++/src/util/xregexp/Makefile.xregexp.lib @@ -3,12 +3,12 @@ SRC = regexp arg_regexp mask_regexp convert_dates_iso8601 LIB = xregexp -CPPFLAGS = $(ORIG_CPPFLAGS) $(PCRE_INCLUDE) +CPPFLAGS = $(ORIG_CPPFLAGS) $(PCRE_INCLUDE) $(PCRE2_INCLUDE) DLL_LIB = $(PCRE_LIB) xutil -LIBS = $(PCRE_LIBS) +LIBS = $(PCRE_LIBS) $(PCRE2_LIBS) USES_LIBRARIES = \ - $(PCRE_LIB) $(PCRE_LIBS) xncbi + $(PCRE_LIB) $(PCRE_LIBS) $(PCRE2_LIBS) xncbi WATCHERS = ivanov diff --git a/c++/src/util/xregexp/regexp.cpp b/c++/src/util/xregexp/regexp.cpp index 9f9c3961..cb61e8b8 100644 --- a/c++/src/util/xregexp/regexp.cpp +++ b/c++/src/util/xregexp/regexp.cpp @@ -34,8 +34,14 @@ #include #include #include -#include -#define PCRE_FLAG(x) PCRE_##x +#ifdef HAVE_LIBPCRE2 +# define PCRE2_CODE_UNIT_WIDTH 8 +# include +# define PCRE_FLAG(x) PCRE2_##x +#else +# include +# define PCRE_FLAG(x) PCRE_##x +#endif #include #include @@ -103,7 +109,15 @@ static int s_GetRealMatchFlags(CRegexp::TMatch match_flags) CRegexp::CRegexp(CTempStringEx pattern, TCompile flags) - : m_PReg(NULL), m_Extra(NULL), m_NumFound(0) + : m_PReg(NULL), +#ifdef HAVE_LIBPCRE2 + m_MatchData(NULL), + m_Results(NULL), + m_JITStatus(PCRE2_ERROR_UNSET), +#else + m_Extra(NULL), +#endif + m_NumFound(0) { Set(pattern, flags); } @@ -111,33 +125,63 @@ CRegexp::CRegexp(CTempStringEx pattern, TCompile flags) CRegexp::~CRegexp() { +#ifdef HAVE_LIBPCRE2 + pcre2_code_free((pcre2_code*)m_PReg); + pcre2_match_data_free((pcre2_match_data*)m_MatchData); +#else (*pcre_free)(m_PReg); (*pcre_free)(m_Extra); +#endif } void CRegexp::Set(CTempStringEx pattern, TCompile flags) { if ( m_PReg ) { +#ifdef HAVE_LIBPCRE2 + pcre2_code_free((pcre2_code*)m_PReg); +#else (*pcre_free)(m_PReg); +#endif } +#ifdef HAVE_LIBPCRE2 + int err_num; +#else const char *err; +#endif TOffset err_offset; int x_flags = s_GetRealCompileFlags(flags); +#ifdef HAVE_LIBPCRE2 + m_PReg = pcre2_compile((PCRE2_SPTR) pattern.data(), pattern.size(), + x_flags, &err_num, &err_offset, NULL); +#else if ( pattern.HasZeroAtEnd() ) { m_PReg = pcre_compile(pattern.data(), x_flags, &err, &err_offset, NULL); } else { m_PReg = pcre_compile(string(pattern).c_str(), x_flags, &err, &err_offset, NULL); } +#endif if ( !m_PReg ) { +#ifdef HAVE_LIBPCRE2 + char err[120]; + pcre2_get_error_message(err_num, (PCRE2_UCHAR*) err, ArraySize(err)); +#endif NCBI_THROW(CRegexpException, eCompile, "Compilation of the pattern '" + string(pattern) + "' failed: " + err); } +#ifdef HAVE_LIBPCRE2 + pcre2_match_data_free((pcre2_match_data*)m_MatchData); + m_MatchData = pcre2_match_data_create_from_pattern((pcre2_code*)m_PReg, + NULL); + // Too heavyweight to use by default; a flag may be in order. + // m_JITStatus = pcre2_jit_compile((pcre2_code*)m_PReg, PCRE2_JIT_COMPLETE); +#else if ( m_Extra ) { (*pcre_free)(m_Extra); } m_Extra = pcre_study((pcre*)m_PReg, 0, &err); +#endif } @@ -158,8 +202,12 @@ CTempString CRegexp::GetSub(CTempString str, size_t idx) const if ( (int)idx >= m_NumFound ) { return CTempString(); } +#ifdef HAVE_LIBPCRE2 + static const PCRE2_SIZE kNotFound = PCRE2_UNSET; +#else static const int kNotFound = -1; - const int * offsets = m_Results; +#endif + const auto * offsets = m_Results; auto start = offsets[2 * idx]; auto end = offsets[2 * idx + 1]; if (start == kNotFound || end == kNotFound) { @@ -172,10 +220,23 @@ CTempString CRegexp::GetSub(CTempString str, size_t idx) const void CRegexp::x_Match(CTempString str, size_t offset, TMatch flags) { int x_flags = s_GetRealMatchFlags(flags); +#ifdef HAVE_LIBPCRE2 + auto f = (m_JITStatus == 0) ? &pcre2_jit_match : &pcre2_match; + auto *match_data = (pcre2_match_data*) m_MatchData; + int rc = (*f)((pcre2_code*) m_PReg, (PCRE2_UCHAR*)str.data(), str.length(), + offset, x_flags, match_data, NULL); + m_Results = pcre2_get_ovector_pointer(match_data); + if (rc >= 0) { + m_NumFound = pcre2_get_ovector_count(match_data); + } else { + m_NumFound = -1; + } +#else m_NumFound = pcre_exec((pcre*)m_PReg, (pcre_extra*)m_Extra, str.data(), (int)str.length(), (int)offset, x_flags, m_Results, (int)(kRegexpMaxSubPatterns +1) * 3); +#endif } -- 2.30.2