Make it possible to build against PCRE2, and do so (#891197).
authorAaron M. Ucko <ucko@debian.org>
Tue, 23 Jul 2024 03:31:22 +0000 (23:31 -0400)
committerAaron M. Ucko <ucko@debian.org>
Tue, 23 Jul 2024 03:31:22 +0000 (23:31 -0400)
* debian/control: Build against libpcre2-dev.
* debian/patches/allow_pcre2 (new): Allow use of PCRE2.
* debian/rules: Build against PCRE2.

debian/changelog
debian/control
debian/patches/allow_pcre2 [new file with mode: 0644]
debian/patches/series
debian/rules

index c308f61dc007b00693103a194facdfe9be4364d5..325343e2897aff1866e4593f37099fc0c46e53dd 100644 (file)
@@ -6,7 +6,9 @@ ncbi-blast+ (2.15.0+ds-1) UNRELEASED; urgency=medium
     - Build against libnghttp2-dev, libsqlite3-dev, and libuv1-dev, newly
       needed.
     - Build against autoconf2.69.
+    - Build against libpcre2-dev, as enabled by debian/patches/allow_pcre2.
   * debian/patches: Update for new release, mostly formally.
+  * debian/patches/allow_pcre2 (new): Allow use of PCRE2.
   * debian/patches/series: Move system_mbedtls_only up in conjunction with
     incorporating part of an upstream patch needed by support_mbedtls3.
   * debian/patches/support_mbedtls3 (new): Support (but don't require)
@@ -16,6 +18,8 @@ ncbi-blast+ (2.15.0+ds-1) UNRELEASED; urgency=medium
     and Makefile.xxconnect2.lib.
   * debian/rules:
     - Regenerate configure and config.h.in with Autoconf 2.69.
+    - Build against PCRE2, as enabled by debian/patches/allow_pcre2.
+      (Closes: #891197.)
     - Extend generated-sources cleanup to objtools/eutils (not entirely
       straightforward due to input format variation).
     - Tune flags: Add --without-strip to make --with-symbols effective;
index 77bf12ddf5798c389339528512056a88a4c61d9f..ef3909726bd22c94b987508f35cb174458d62719 100644 (file)
@@ -15,6 +15,7 @@ Build-Depends-Arch: 2to3,
                     liblmdb-dev,
                     libmbedtls-dev,
                     libnghttp2-dev,
+                    libpcre2-dev,
                     libsqlite3-dev,
                     libuv1-dev,
                     time,
diff --git a/debian/patches/allow_pcre2 b/debian/patches/allow_pcre2
new file mode 100644 (file)
index 0000000..bfef446
--- /dev/null
@@ -0,0 +1,372 @@
+From d615c51fb9fe6172a84feea47906abd164198fd5 Mon Sep 17 00:00:00 2001
+From: ucko <ucko@ncbi.nlm.nih.gov>
+Date: Thu, 18 Jul 2024 18:33:56 +0000
+Subject: [PATCH] Allow substituting PCRE2 for legacy PCRE by explicit request.
+
+- For now, require opt-in via --with-components="...;PCRE2;..." (CMake),
+  --with-pcre2 (traditional Unix build system), or uncommenting the
+  relevant ThirdParty_PCRE2 setting in project_tree_builder.ini
+  (traditional Windows build system).
+- Likewise, hold off on switching the bundled copy to PCRE2 or checking
+  for any functions or headers that will become of interest.
+- Redundantly (for now) shun external PCRE2 in bin-release configurations.
+- Otherwise favor (allowed!) external installations over the bundled
+  copy, preferring PCRE2 over legacy PCRE when both are found and
+  allowed but (in due course) external legacy PCRE over bundled PCRE2 in
+  the absence of external PCRE2.
+- In the traditional build system, have the widely used PCRE_LIBS macro
+  correspond to whichever PCRE is default (when not falling back on a
+  bundled copy), and add a PCRE_LEGACY_LIBS macro for the sake of
+  anything using legacy PCRE directly (very occasionally seen).
+
+JIRA: CXX-12761,
+
+git-svn-id: https://anonsvn.ncbi.nlm.nih.gov/repos/v1/trunk/c++@102806 78c7ea69-d796-4a43-9a09-de51944f1b03
+
+Irrelevant (and inapplicable) changes to c++/src/build-system/cmake/,
+c++/src/build-system/configure (which will be regenerated anyway), and
+c++/src/build-system/project_tree_builder.ini elided.
+---
+ include/util/xregexp/regexp.hpp               |  13 +-
+ src/build-system/Makefile.mk.in               |   3 +
+ .../cmake/CMake.NCBIComponents.cmake          |   2 +-
+ .../cmake/CMake.NCBIComponentsMSVC.cmake      |  17 +-
+ .../cmake/CMake.NCBIComponentsPackage.cmake   |  10 +-
+ .../cmake/CMake.NCBIComponentsUNIXex.cmake    |  18 +-
+ .../cmake/CMake.NCBIComponentsXCODE.cmake     |   9 +
+ .../cmake/CMakeChecks.compiler.cmake          |   1 +
+ src/build-system/cmake/conanfile.py           |   2 +
+ src/build-system/cmake/config.cmake.h.in      |   3 +
+ src/build-system/config.h.in                  |   3 +
+ src/build-system/configure                    | 181 +++++++++++++++++-
+ src/build-system/configure.ac                 |  34 +++-
+ src/build-system/project_tree_builder.ini     |  21 ++
+ src/util/xregexp/CMakeLists.txt               |   7 +-
+ src/util/xregexp/Makefile.xregexp.lib         |   6 +-
+ src/util/xregexp/regexp.cpp                   |  69 ++++++-
+ 17 files changed, 376 insertions(+), 23 deletions(-)
+
+--- a/c++/include/util/xregexp/regexp.hpp
++++ b/c++/include/util/xregexp/regexp.hpp
+@@ -70,7 +70,11 @@ class NCBI_XREGEXP_EXPORT CRegexp
+ {
+ public:
+     /// Element type for GetResults().
++#ifdef HAVE_LIBPCRE2
++    typedef size_t TOffset;
++#else
+     typedef int TOffset;
++#endif
+     /// Type definitions used for code clarity.
+     typedef unsigned int TCompile;     ///< Compilation options.
+@@ -287,13 +291,20 @@ private:
+     void x_Match(CTempString str, size_t offset, TMatch flags);
+     void*  m_PReg;   /// Pointer to compiled PCRE pattern.
+-    void*  m_Extra;  /// Pointer to extra structure used for pattern study.
++#ifdef HAVE_LIBPCRE2
++    void*    m_MatchData;
++    TOffset* m_Results;
++    int      m_JITStatus;
++#else
++    void*  m_Extra;  /// Pointer to extra structure used for pattern study.
+     /// Array of locations of patterns/subpatterns resulting from
+     /// the last call to GetMatch(). Also contains 1/3 extra space used
+     /// internally by the PCRE C library.
+     int m_Results[(kRegexpMaxSubPatterns +1) * 3];
++#endif
++
+     /// The total number of pattern + subpatterns resulting from
+     /// the last call to GetMatch.
+--- a/c++/src/build-system/Makefile.mk.in
++++ b/c++/src/build-system/Makefile.mk.in
+@@ -390,6 +390,9 @@ CMPRS_LIB     = $(Z_LIB) $(BZ2_LIB) $(ZS
+ # wrapper and goes by the name "regexp".
+ PCRE_INCLUDE   = @PCRE_INCLUDE@
+ PCRE_LIBS      = @PCRE_LIBS@
++PCRE_LEGACY_LIBS = @PCRE_LEGACY_LIBS@
++PCRE2_INCLUDE  = @PCRE2_INCLUDE@
++PCRE2_LIBS     = @PCRE2_LIBS@
+ PCREPOSIX_LIBS = @PCREPOSIX_LIBS@
+ PCRE_LIB       = @PCRE_LIB@
+--- a/c++/src/build-system/config.h.in
++++ b/c++/src/build-system/config.h.in
+@@ -486,6 +486,9 @@
+ /* Define to 1 if libpcre is available. */
+ #undef HAVE_LIBPCRE
++/* Define to 1 if libpcre2 is available. */
++#undef HAVE_LIBPCRE2
++
+ /* Define to 1 if libpng is available. */
+ #undef HAVE_LIBPNG
+--- a/c++/src/build-system/configure.ac
++++ b/c++/src/build-system/configure.ac
+@@ -90,7 +90,7 @@ case "$with_3psw" in
+          with_yaml_cpp=no
+       fi
+       m4_foreach(X, [sss, sssutils, sssdb, vdb, ngs, ncbicrypt, libunwind,
+-                     z, bz2, lzo, zstd, pcre, mbedtls,
++                     z, bz2, lzo, zstd, pcre, pcre2, mbedtls,
+                      gmp, gcrypt, nettle, gnutls, openssl, krb5, boost, lmdb,
+                      sybase, ftds, mysql, opengl, mesa, glut, glew, gl2ps,
+                      wxwidgets, freetype, ftgl, fastcgi, fastcgipp,
+@@ -324,6 +324,10 @@ AC_ARG_WITH(pcre,
+    [ --with-pcre=DIR         use PCRE installation in DIR])
+ AC_ARG_WITH(pcre,
+    [ --without-pcre          use internal copy of PCRE])
++AC_ARG_WITH(pcre2,
++   [ --with-pcre2=DIR        use PCRE2 installation in DIR])
++AC_ARG_WITH(pcre,
++   [ --without-pcre2         do not use PCRE2])
+ AC_ARG_WITH(mbedtls,
+    [ --with-mbedtls(=DIR)    use external mbedTLS installation (in DIR)])
+ AC_ARG_WITH(gmp,
+@@ -728,7 +732,7 @@ ncbi-c wxwidgets wxwidgets-ucs fastcgi f
+ sss sssdb sssutils included-sss \
+ geo included-geo vdb downloaded-vdb static-vdb ngs ncbicrypt libunwind libdw \
+ backward-cpp backward-cpp-sig \
+-z bz2 lzo zstd pcre mbedtls \
++z bz2 lzo zstd pcre pcre2 mbedtls \
+ gmp gcrypt nettle gnutls static-gnutls openssl krb5 \
+ sybase sybase-local sybase-new ftds mysql \
+ orbacus freetype ftgl opengl mesa glut glew glew-mx gl2ps \
+@@ -798,7 +802,7 @@ for x_arg in "$@" ; do
+       | --with-ncbicrypt=* \
+       | --with-libunwind=* | --with-libdw=* | --with-backward-cpp=* \
+       | --with-z=* | --with-bz2=* | --with-lzo=* | --with-zstd=* \
+-      | --with-pcre=* | --with-mbedtls=* \
++      | --with-pcre=* | --with-pcre2=* | --with-mbedtls=* \
+       | --with-gmp=* | --with-gcrypt=* | --with-nettle=* \
+       | --with-gnutls=* | --with-openssl=* | --with-krb5=* \
+       | --with-sybase-local=* | --with-ftds=*/* | --with-mysql=* \
+@@ -892,6 +896,7 @@ if test "$with_bin_release" = "yes" ; th
+    : ${with_libdw=no}
+    : ${with_ncbicrypt=no}
+    : ${with_pcre=no} # Too much variation across distributions.
++   : ${with_pcre2=no}
+    : ${with_sse42=no}
+    AC_DEFINE(NCBI_BIN_RELEASE, 1,
+              [Define to 1 when building binaries for public release.])
+@@ -4832,7 +4837,18 @@ fi
+ NCBI_CHECK_THIRD_PARTY_LIB(pcre,
+  [AC_LANG_PROGRAM([@%:@include <pcre.h>],
+    [[const char*s[]={"x"}; pcre* p; pcre_extra* x = pcre_study(p, 1, s);]])])
+-if test -z "$PCRE_LIBS"; then
++
++: ${with_pcre2=no}
++NCBI_CHECK_THIRD_PARTY_LIB_EX(pcre2, PCRE2, pcre2-8,
++ [AC_LANG_PROGRAM([@%:@define PCRE2_CODE_UNIT_WIDTH 8
++                   @%:@include <pcre2.h>],
++   [[pcre2_config(123, NULL);]])])
++PCRE_LEGACY_LIBS=$PCRE_LIBS
++if test "x$with_pcre2" != xno; then
++   PCRE_LIBS=$PCRE2_LIBS
++fi
++
++if test -z "$PCRE_LIBS" -a -z "$PCRE2_LIBS"; then
+    pcrelocal=util/regexp
+    AC_MSG_NOTICE([using local PCRE copy in $pcrelocal])
+    PCRE_PATH="<$pcrelocal>"
+@@ -4841,8 +4857,15 @@ if test -z "$PCRE_LIBS"; then
+    # PCREPOSIX_LIBS="-lregexp"
+    PCRE_LIB="regexp"
+    AC_DEFINE(USE_LOCAL_PCRE, 1, [Define to 1 if using a local copy of PCRE.])
+-   NCBI_PACKAGE(PCRE)
++   if test -f $srcdir/include/$pcrelocal/pcre2.h; then
++      AC_DEFINE(HAVE_LIBPCRE2, 1, [Define to 1 if libpcre2 is available.])
++      NCBI_PACKAGE(PCRE2)
++   else
++      NCBI_PACKAGE(PCRE)
++   fi
+    NCBI_PACKAGE(LocalPCRE)
++elif test -n "$PCRE2_LIBS"; then
++   PCREPOSIX_LIBS=`echo "$PCRE2_LIBS" | sed -e 's/-lpcre2-8/-lpcre2posix &/'`
+ else
+    PCREPOSIX_LIBS=`echo "$PCRE_LIBS" | sed -e 's/-lpcre/-lpcreposix -lpcre/'`
+ fi
+@@ -9765,6 +9788,7 @@ AC_SUBST(BZ2_LIB)
+ AC_SUBST(ZSTD_STATIC_LIBS)
+ AC_SUBST(PCREPOSIX_LIBS)
+ AC_SUBST(PCRE_LIB)
++AC_SUBST(PCRE_LEGACY_LIBS)
+ AC_SUBST(OPENSSL_STATIC_LIBS)
+ AC_SUBST(CURL_STATIC_LIBS)
+ AC_SUBST(SYBASE_PATH)
+--- a/c++/src/util/xregexp/CMakeLists.txt
++++ b/c++/src/util/xregexp/CMakeLists.txt
+@@ -1,6 +1,11 @@
+ # $Id: CMakeLists.txt 621427 2020-12-11 14:26:55Z ivanov $
+ NCBI_project_tags(core)
+-NCBI_requires(PCRE)
++# NCBI_optional_components(PCRE PCRE2)
++if(NCBI_COMPONENT_PCRE2_FOUND)
++  NCBI_REQUIRES(PCRE2)
++else()
++  NCBI_REQUIRES(PCRE)
++endif()
+ NCBI_add_library(xregexp xregexp_template_tester)
+--- a/c++/src/util/xregexp/Makefile.xregexp.lib
++++ b/c++/src/util/xregexp/Makefile.xregexp.lib
+@@ -3,12 +3,12 @@
+ SRC = regexp arg_regexp mask_regexp convert_dates_iso8601
+ LIB = xregexp
+-CPPFLAGS = $(ORIG_CPPFLAGS) $(PCRE_INCLUDE)
++CPPFLAGS = $(ORIG_CPPFLAGS) $(PCRE_INCLUDE) $(PCRE2_INCLUDE)
+ DLL_LIB = $(PCRE_LIB) xutil
+-LIBS    = $(PCRE_LIBS)
++LIBS    = $(PCRE_LIBS) $(PCRE2_LIBS)
+ USES_LIBRARIES =  \
+-    $(PCRE_LIB) $(PCRE_LIBS) xncbi
++    $(PCRE_LIB) $(PCRE_LIBS) $(PCRE2_LIBS) xncbi
+ WATCHERS = ivanov
+--- a/c++/src/util/xregexp/regexp.cpp
++++ b/c++/src/util/xregexp/regexp.cpp
+@@ -34,8 +34,14 @@
+ #include <corelib/ncbi_limits.h>
+ #include <corelib/ncbistl.hpp>
+ #include <util/xregexp/regexp.hpp>
+-#include <pcre.h>
+-#define PCRE_FLAG(x) PCRE_##x
++#ifdef HAVE_LIBPCRE2
++#  define PCRE2_CODE_UNIT_WIDTH 8
++#  include <pcre2.h>
++#  define PCRE_FLAG(x) PCRE2_##x
++#else
++#  include <pcre.h>
++#  define PCRE_FLAG(x) PCRE_##x
++#endif
+ #include <memory>
+ #include <stdlib.h>
+@@ -103,7 +109,15 @@ static int s_GetRealMatchFlags(CRegexp::
+ CRegexp::CRegexp(CTempStringEx pattern, TCompile flags)
+-    : m_PReg(NULL), m_Extra(NULL), m_NumFound(0)
++    : m_PReg(NULL),
++#ifdef HAVE_LIBPCRE2
++      m_MatchData(NULL),
++      m_Results(NULL),
++      m_JITStatus(PCRE2_ERROR_UNSET),
++#else
++      m_Extra(NULL),
++#endif
++      m_NumFound(0)
+ {
+     Set(pattern, flags);
+ }
+@@ -111,33 +125,63 @@ CRegexp::CRegexp(CTempStringEx pattern,
+ CRegexp::~CRegexp()
+ {
++#ifdef HAVE_LIBPCRE2
++    pcre2_code_free((pcre2_code*)m_PReg);
++    pcre2_match_data_free((pcre2_match_data*)m_MatchData);
++#else
+     (*pcre_free)(m_PReg);
+     (*pcre_free)(m_Extra);
++#endif
+ }
+ void CRegexp::Set(CTempStringEx pattern, TCompile flags)
+ {
+     if ( m_PReg ) {
++#ifdef HAVE_LIBPCRE2
++        pcre2_code_free((pcre2_code*)m_PReg);
++#else
+         (*pcre_free)(m_PReg);
++#endif
+     }
++#ifdef HAVE_LIBPCRE2
++    int err_num;
++#else
+     const char *err;
++#endif
+     TOffset err_offset;
+     int x_flags = s_GetRealCompileFlags(flags);
++#ifdef HAVE_LIBPCRE2
++    m_PReg = pcre2_compile((PCRE2_SPTR) pattern.data(), pattern.size(),
++                           x_flags, &err_num, &err_offset, NULL);
++#else
+     if ( pattern.HasZeroAtEnd() ) {
+         m_PReg = pcre_compile(pattern.data(), x_flags, &err, &err_offset, NULL);
+     } else {
+         m_PReg = pcre_compile(string(pattern).c_str(), x_flags, &err, &err_offset, NULL);
+     }
++#endif
+     if ( !m_PReg ) {
++#ifdef HAVE_LIBPCRE2
++        char err[120];
++        pcre2_get_error_message(err_num, (PCRE2_UCHAR*) err, ArraySize(err));
++#endif
+         NCBI_THROW(CRegexpException, eCompile, "Compilation of the pattern '" +
+                    string(pattern) + "' failed: " + err);
+     }
++#ifdef HAVE_LIBPCRE2
++    pcre2_match_data_free((pcre2_match_data*)m_MatchData);
++    m_MatchData = pcre2_match_data_create_from_pattern((pcre2_code*)m_PReg,
++                                                       NULL);
++    // Too heavyweight to use by default; a flag may be in order.
++    // m_JITStatus = pcre2_jit_compile((pcre2_code*)m_PReg, PCRE2_JIT_COMPLETE);
++#else
+     if ( m_Extra ) {
+         (*pcre_free)(m_Extra);
+     }
+     m_Extra = pcre_study((pcre*)m_PReg, 0, &err);
++#endif
+ }
+@@ -158,8 +202,12 @@ CTempString CRegexp::GetSub(CTempString
+     if ( (int)idx >= m_NumFound ) {
+         return CTempString();
+     }
++#ifdef HAVE_LIBPCRE2
++    static const PCRE2_SIZE kNotFound = PCRE2_UNSET;
++#else
+     static const int kNotFound = -1;
+-    const int * offsets = m_Results;
++#endif
++    const auto * offsets = m_Results;
+     auto start = offsets[2 * idx];
+     auto end   = offsets[2 * idx + 1];
+     if (start == kNotFound  ||  end == kNotFound) {
+@@ -172,10 +220,23 @@ CTempString CRegexp::GetSub(CTempString
+ void CRegexp::x_Match(CTempString str, size_t offset, TMatch flags)
+ {
+     int x_flags = s_GetRealMatchFlags(flags);
++#ifdef HAVE_LIBPCRE2
++    auto f = (m_JITStatus == 0) ? &pcre2_jit_match : &pcre2_match;
++    auto *match_data = (pcre2_match_data*) m_MatchData;
++    int rc = (*f)((pcre2_code*) m_PReg, (PCRE2_UCHAR*)str.data(), str.length(),
++                  offset, x_flags, match_data, NULL);
++    m_Results = pcre2_get_ovector_pointer(match_data);
++    if (rc >= 0) {
++        m_NumFound = pcre2_get_ovector_count(match_data);
++    } else {
++        m_NumFound = -1;
++    }
++#else
+     m_NumFound = pcre_exec((pcre*)m_PReg, (pcre_extra*)m_Extra, str.data(),
+                            (int)str.length(), (int)offset,
+                            x_flags, m_Results,
+                            (int)(kRegexpMaxSubPatterns +1) * 3);
++#endif
+ }
index 9e989402a9ccb305f1b3a1495244435d85815402..e5ab9c3a9a3b59acec0fb98b9a0189fabdd274dc 100644 (file)
@@ -1,5 +1,6 @@
 system_mbedtls_only
 support_mbedtls3
+allow_pcre2
 optin_usage_report
 enable_clean_after_failed_compile
 hurd_fixes
index 9093c1ce4238879ac2444a25a844ee273b452bb3..3fffe16836296e8aee69453e16310ef34092d1ba 100755 (executable)
@@ -15,7 +15,8 @@ DEB_CONFIGURE_COMMON_FLAGS=--without-autodep --without-makefile-auto-update \
     --with-flat-makefile --without-caution --without-dbapi --without-lzo \
     --without-debug --without-downloaded-vdb --without-sse42
 DEB_CONFIGURE_EXTRA_FLAGS=$(DEB_CONFIGURE_COMMON_FLAGS) --with-dll --with-mt \
-    --with-runpath=/usr/lib/ncbi-blast+ --with-build-root=BUILD --with-mbedtls
+    --with-runpath=/usr/lib/ncbi-blast+ --with-build-root=BUILD \
+    --with-mbedtls --with-pcre2
 
 proj=algo/blast/ app/ objmgr/ objtools/align_format/ objtools/blast/