From f94ab1f4d0affa08d0af9e8b3ed5d15217245ebc Mon Sep 17 00:00:00 2001
From: Matthew Vernon
\n";
+$inpara = 1;
+}
+
+
+# Main program
+
+$innf = 0;
+$inpara = 0;
+$inpre = 0;
+$wrotetext = 0;
+$toc = 0;
+$ref = 1;
+
+while ($#ARGV >= 0 && $ARGV[0] =~ /^-/)
+ {
+ $toc = 1 if $ARGV[0] eq "-toc";
+ shift;
+ }
+
+# Initial output to STDOUT
+
+print <
+Return to the PCRE2 index page.
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+$ARGV[0] man page
+
+End
+
+print "\n" if ($toc);
+
+open(TEMP, ">/tmp/$$") || die "Can't open /tmp/$$ for output\n";
+
+while (
\n" if ($toc);
+
+# Copy the remainder to the standard output
+
+close(TEMP);
+open(TEMP, "/tmp/$$") || die "Can't open /tmp/$$ for input\n";
+
+print while (
+ # and
that delimit literal sections will do the spacing. Always skip
+ # if no previous output.
+
+ elsif (/^\.sp/)
+ {
+ if ($wrotetext)
+ {
+ $_ =
\n
\n" if ($innf || /^\.nf/ || !/^[\s.]/);
+ }
+ redo; # Now process the lookahead line we just read
+ }
+ }
+ elsif (/^\.TP/ || /^\.PP/ || /^\.P/)
+ {
+ &new_para();
+ }
+ elsif (/^\.SH\s*("?)(.*)\1/)
+ {
+ # Ignore the NAME section
+ if ($2 =~ /^NAME\b/)
+ {
+
$title
\n",
+ $ref);
+ $ref++;
+ }
+ else
+ {
+ print TEMP "
\n$title\n
\n";
+ }
+ }
+ elsif (/^\.SS\s*("?)(.*)\1/)
+ {
+ &end_para();
+ my($title) = &do_line($2);
+ print TEMP "
\n$title\n
\n";
+ }
+ elsif (/^\.B\s*(.*)/)
+ {
+ &new_para() if (!$inpara);
+ $_ = &do_line($1);
+ s/"(.*?)"/$1/g;
+ print TEMP "$_\n";
+ $wrotetext = 1;
+ }
+ elsif (/^\.I\s*(.*)/)
+ {
+ &new_para() if (!$inpara);
+ $_ = &do_line($1);
+ s/"(.*?)"/$1/g;
+ print TEMP "$_\n";
+ $wrotetext = 1;
+ }
+
+ # Remove the "AUTOMATICALLY GENERATED" warning from pcre2demo.3
+ elsif (/^\.\\"AUTOMATICALLY GENERATED/) { next; }
+
+ # A comment that starts "HREF" takes the next line as a name that
+ # is turned into a hyperlink, using the text given, which might be
+ # in a special font. If it ends in () or (digits) or punctuation, they
+ # aren't part of the link.
+
+ elsif (/^\.\\"\s*HREF/)
+ {
+ $_=\n";
+ while (
\n";
+ $inpre = 1;
+ }
+ }
+ elsif ($inpre)
+ {
+ print TEMP "\n";
+ $inpre = 0;
+ }
+
+ # Add
to the end of a non-literal line if we are within .nf/.fi
+
+ $_ .= "
\n" if (!$inpre && $innf);
+
+ print TEMP;
+ $wrotetext = 1;
+ }
+
+# The TOC, if present, will have been written - terminate it
+
+print "
+The HTML documentation for PCRE2 consists of a number of pages that are listed +below in alphabetical order. If you are new to PCRE2, please read the first one +first. +
+ +| pcre2 | +Introductory page |
| pcre2-config | +Information about the installation configuration |
| pcre2api | +PCRE2's native API |
| pcre2build | +Building PCRE2 |
| pcre2callout | +The callout facility |
| pcre2compat | +Compatibility with Perl |
| pcre2convert | +Experimental foreign pattern conversion functions |
| pcre2demo | +A demonstration C program that uses the PCRE2 library |
| pcre2grep | +The pcre2grep command |
| pcre2jit | +Discussion of the just-in-time optimization support |
| pcre2limits | +Details of size and other limits |
| pcre2matching | +Discussion of the two matching algorithms |
| pcre2partial | +Using PCRE2 for partial matching |
| pcre2pattern | +Specification of the regular expressions supported by PCRE2 |
| pcre2perform | +Some comments on performance |
| pcre2posix | +The POSIX API to the PCRE2 8-bit library |
| pcre2sample | +Discussion of the pcre2demo program |
| pcre2serialize | +Serializing functions for saving precompiled patterns |
| pcre2syntax | +Syntax quick-reference summary |
| pcre2test | +The pcre2test command for testing PCRE2 |
| pcre2unicode | +Discussion of Unicode and UTF-8/UTF-16/UTF-32 support |
+There are also individual pages that summarize the interface for each function +in the library. +
+ +| pcre2_callout_enumerate | +Enumerate callouts in a compiled pattern |
| pcre2_code_copy | +Copy a compiled pattern |
| pcre2_code_copy_with_tables | +Copy a compiled pattern and its character tables |
| pcre2_code_free | +Free a compiled pattern |
| pcre2_compile | +Compile a regular expression pattern |
| pcre2_compile_context_copy | +Copy a compile context |
| pcre2_compile_context_create | +Create a compile context |
| pcre2_compile_context_free | +Free a compile context |
| pcre2_config | +Show build-time configuration options |
| pcre2_convert_context_copy | +Copy a convert context |
| pcre2_convert_context_create | +Create a convert context |
| pcre2_convert_context_free | +Free a convert context |
| pcre2_converted_pattern_free | +Free converted foreign pattern |
| pcre2_dfa_match | +Match a compiled pattern to a subject string + (DFA algorithm; not Perl compatible) |
| pcre2_general_context_copy | +Copy a general context |
| pcre2_general_context_create | +Create a general context |
| pcre2_general_context_free | +Free a general context |
| pcre2_get_error_message | +Get textual error message for error number |
| pcre2_get_mark | +Get a (*MARK) name |
| pcre2_get_match_data_size | +Get the size of a match data block |
| pcre2_get_ovector_count | +Get the ovector count |
| pcre2_get_ovector_pointer | +Get a pointer to the ovector |
| pcre2_get_startchar | +Get the starting character offset |
| pcre2_jit_compile | +Process a compiled pattern with the JIT compiler |
| pcre2_jit_free_unused_memory | +Free unused JIT memory |
| pcre2_jit_match | +Fast path interface to JIT matching |
| pcre2_jit_stack_assign | +Assign stack for JIT matching |
| pcre2_jit_stack_create | +Create a stack for JIT matching |
| pcre2_jit_stack_free | +Free a JIT matching stack |
| pcre2_maketables | +Build character tables in current locale |
| pcre2_maketables_free | +Free character tables |
| pcre2_match | +Match a compiled pattern to a subject string + (Perl compatible) |
| pcre2_match_context_copy | +Copy a match context |
| pcre2_match_context_create | +Create a match context |
| pcre2_match_context_free | +Free a match context |
| pcre2_match_data_create | +Create a match data block |
| pcre2_match_data_create_from_pattern | +Create a match data block getting size from pattern |
| pcre2_match_data_free | +Free a match data block |
| pcre2_pattern_convert | +Experimental foreign pattern converter |
| pcre2_pattern_info | +Extract information about a pattern |
| pcre2_serialize_decode | +Decode serialized compiled patterns |
| pcre2_serialize_encode | +Serialize compiled patterns for save/restore |
| pcre2_serialize_free | +Free serialized compiled patterns |
| pcre2_serialize_get_number_of_codes | +Get number of serialized compiled patterns |
| pcre2_set_bsr | +Set \R convention |
| pcre2_set_callout | +Set up a callout function |
| pcre2_set_character_tables | +Set character tables |
| pcre2_set_compile_extra_options | +Set compile time extra options |
| pcre2_set_compile_recursion_guard | +Set up a compile recursion guard function |
| pcre2_set_depth_limit | +Set the match backtracking depth limit |
| pcre2_set_glob_escape | +Set glob escape character |
| pcre2_set_glob_separator | +Set glob separator character |
| pcre2_set_heap_limit | +Set the match backtracking heap limit |
| pcre2_set_match_limit | +Set the match limit |
| pcre2_set_max_pattern_compiled_length | +Set the maximum length of a compiled pattern |
| pcre2_set_max_pattern_length | +Set the maximum length of a pattern |
| pcre2_set_max_varlookbehind | +Set the maximum match length for a variable-length lookbehind |
| pcre2_set_newline | +Set the newline convention |
| pcre2_set_offset_limit | +Set the offset limit |
| pcre2_set_parens_nest_limit | +Set the parentheses nesting limit |
| pcre2_set_recursion_limit | +Obsolete: use pcre2_set_depth_limit |
| pcre2_set_recursion_memory_management | +Obsolete function that (from 10.30 onwards) does nothing |
| pcre2_substitute | +Match a compiled pattern to a subject string and do + substitutions |
| pcre2_substring_copy_byname | +Extract named substring into given buffer |
| pcre2_substring_copy_bynumber | +Extract numbered substring into given buffer |
| pcre2_substring_free | +Free extracted substring |
| pcre2_substring_get_byname | +Extract named substring into new memory |
| pcre2_substring_get_bynumber | +Extract numbered substring into new memory |
| pcre2_substring_length_byname | +Find length of named substring |
| pcre2_substring_length_bynumber | +Find length of numbered substring |
| pcre2_substring_list_free | +Free list of extracted substrings |
| pcre2_substring_list_get | +Extract all substrings into new memory |
| pcre2_substring_nametable_scan | +Find table entries for given string name |
| pcre2_substring_number_from_name | +Convert captured string name to number |
+Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+pcre2-config [--prefix] [--exec-prefix] [--version] + [--libs8] [--libs16] [--libs32] [--libs-posix] + [--cflags] [--cflags-posix] +
++pcre2-config returns the configuration of the installed PCRE2 libraries +and the options required to compile a program to use them. Some of the options +apply only to the 8-bit, or 16-bit, or 32-bit libraries, respectively, and are +not available for libraries that have not been built. If an unavailable option +is encountered, the "usage" information is output. +
++--prefix +Writes the directory prefix used in the PCRE2 installation for architecture +independent files (/usr on many systems, /usr/local on some +systems) to the standard output. +
++--exec-prefix +Writes the directory prefix used in the PCRE2 installation for architecture +dependent files (normally the same as --prefix) to the standard output. +
++--version +Writes the version number of the installed PCRE2 libraries to the standard +output. +
++--libs8 +Writes to the standard output the command line options required to link +with the 8-bit PCRE2 library (-lpcre2-8 on many systems). +
++--libs16 +Writes to the standard output the command line options required to link +with the 16-bit PCRE2 library (-lpcre2-16 on many systems). +
++--libs32 +Writes to the standard output the command line options required to link +with the 32-bit PCRE2 library (-lpcre2-32 on many systems). +
++--libs-posix +Writes to the standard output the command line options required to link with +PCRE2's POSIX API wrapper library (-lpcre2-posix -lpcre2-8 on many +systems). +
++--cflags +Writes to the standard output the command line options required to compile +files that use PCRE2 (this may include some -I options, but is blank on +many systems). +
++--cflags-posix +Writes to the standard output the command line options required to compile +files that use PCRE2's POSIX API wrapper library (this may include some +-I options, but is blank on many systems). +
++pcre2(3) +
++This manual page was originally written by Mark Baker for the Debian GNU/Linux +system. It has been subsequently revised as a generic PCRE2 man page. +
+
+Last updated: 28 September 2014
+
+
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2.html b/doc/html/pcre2.html new file mode 100644 index 0000000..4cb83dc --- /dev/null +++ b/doc/html/pcre2.html @@ -0,0 +1,214 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+PCRE2 is the name used for a revised API for the PCRE library, which is a set +of functions, written in C, that implement regular expression pattern matching +using the same syntax and semantics as Perl, with just a few differences. After +nearly two decades, the limitations of the original API were making development +increasingly difficult. The new API is more extensible, and it was simplified +by abolishing the separate "study" optimizing function; in PCRE2, patterns are +automatically optimized where possible. Since forking from PCRE1, the code has +been extensively refactored and new features introduced. The old library is now +obsolete and is no longer maintained. +
++As well as Perl-style regular expression patterns, some features that appeared +in Python and the original PCRE before they appeared in Perl are available +using the Python syntax. There is also some support for one or two .NET and +Oniguruma syntax items, and there are options for requesting some minor changes +that give better ECMAScript (aka JavaScript) compatibility. +
++The source code for PCRE2 can be compiled to support strings of 8-bit, 16-bit, +or 32-bit code units, which means that up to three separate libraries may be +installed, one for each code unit size. The size of code unit is not related to +the bit size of the underlying hardware. In a 64-bit environment that also +supports 32-bit applications, versions of PCRE2 that are compiled in both +64-bit and 32-bit modes may be needed. +
++The original work to extend PCRE to 16-bit and 32-bit code units was done by +Zoltan Herczeg and Christian Persch, respectively. In all three cases, strings +can be interpreted either as one character per code unit, or as UTF-encoded +Unicode, with support for Unicode general category properties. Unicode support +is optional at build time (but is the default). However, processing strings as +UTF code units must be enabled explicitly at run time. The version of Unicode +in use can be discovered by running +
+ pcre2test -C ++ +
+The three libraries contain identical sets of functions, with names ending in +_8, _16, or _32, respectively (for example, pcre2_compile_8()). However, +by defining PCRE2_CODE_UNIT_WIDTH to be 8, 16, or 32, a program that uses just +one code unit width can be written using generic names such as +pcre2_compile(), and the documentation is written assuming that this is +the case. +
++In addition to the Perl-compatible matching function, PCRE2 contains an +alternative function that matches the same compiled patterns in a different +way. In certain circumstances, the alternative function has some advantages. +For a discussion of the two matching algorithms, see the +pcre2matching +page. +
++Details of exactly which Perl regular expression features are and are not +supported by PCRE2 are given in separate documents. See the +pcre2pattern +and +pcre2compat +pages. There is a syntax summary in the +pcre2syntax +page. +
++Some features of PCRE2 can be included, excluded, or changed when the library +is built. The +pcre2_config() +function makes it possible for a client to discover which features are +available. The features themselves are described in the +pcre2build +page. Documentation about building PCRE2 for various operating systems can be +found in the +README +and +NON-AUTOTOOLS_BUILD +files in the source distribution. +
++The libraries contain a number of undocumented internal functions and data +tables that are used by more than one of the exported external functions, but +which are not intended for use by external callers. Their names all begin with +"_pcre2", which hopefully will not provoke any name clashes. In some +environments, it is possible to control which external symbols are exported +when a shared library is built, and in these cases the undocumented symbols are +not exported. +
++If you are using PCRE2 in a non-UTF application that permits users to supply +arbitrary patterns for compilation, you should be aware of a feature that +allows users to turn on UTF support from within a pattern. For example, an +8-bit pattern that begins with "(*UTF)" turns on UTF-8 mode, which interprets +patterns and subjects as strings of UTF-8 code units instead of individual +8-bit characters. This causes both the pattern and any data against which it is +matched to be checked for UTF-8 validity. If the data string is very long, such +a check might use sufficiently many resources as to cause your application to +lose performance. +
++One way of guarding against this possibility is to use the +pcre2_pattern_info() function to check the compiled pattern's options for +PCRE2_UTF. Alternatively, you can set the PCRE2_NEVER_UTF option when calling +pcre2_compile(). This causes a compile time error if the pattern contains +a UTF-setting sequence. +
++The use of Unicode properties for character types such as \d can also be +enabled from within the pattern, by specifying "(*UCP)". This feature can be +disallowed by setting the PCRE2_NEVER_UCP option. +
++If your application is one that supports UTF, be aware that validity checking +can take time. If the same data string is to be matched many times, you can use +the PCRE2_NO_UTF_CHECK option for the second and subsequent matches to avoid +running redundant checks. +
++The use of the \C escape sequence in a UTF-8 or UTF-16 pattern can lead to +problems, because it may leave the current matching point in the middle of a +multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C option can be used by an +application to lock out the use of \C, causing a compile-time error if it is +encountered. It is also possible to build PCRE2 with the use of \C permanently +disabled. +
++Another way that performance can be hit is by running a pattern that has a very +large search tree against a string that will never match. Nested unlimited +repeats in a pattern are a common example. PCRE2 provides some protection +against this: see the pcre2_set_match_limit() function in the +pcre2api +page. There is a similar function called pcre2_set_depth_limit() that can +be used to restrict the amount of memory that is used. +
++The user documentation for PCRE2 comprises a number of different sections. In +the "man" format, each of these is a separate "man page". In the HTML format, +each is a separate page, linked from the index page. In the plain text format, +the descriptions of the pcre2grep and pcre2test programs are in +files called pcre2grep.txt and pcre2test.txt, respectively. The +remaining sections, except for the pcre2demo section (which is a program +listing), and the short pages for individual functions, are concatenated in +pcre2.txt, for ease of searching. The sections are as follows: +
+ pcre2 this document + pcre2-config show PCRE2 installation configuration information + pcre2api details of PCRE2's native C API + pcre2build building PCRE2 + pcre2callout details of the pattern callout feature + pcre2compat discussion of Perl compatibility + pcre2convert details of pattern conversion functions + pcre2demo a demonstration C program that uses PCRE2 + pcre2grep description of the pcre2grep command (8-bit only) + pcre2jit discussion of just-in-time optimization support + pcre2limits details of size and other limits + pcre2matching discussion of the two matching algorithms + pcre2partial details of the partial matching facility + pcre2pattern syntax and semantics of supported regular expression patterns + pcre2perform discussion of performance issues + pcre2posix the POSIX-compatible C API for the 8-bit library + pcre2sample discussion of the pcre2demo program + pcre2serialize details of pattern serialization + pcre2syntax quick syntax reference + pcre2test description of the pcre2test command + pcre2unicode discussion of Unicode and UTF support ++In the "man" and HTML formats, there is also a short page for each C library +function, listing its arguments and results. + +
+Philip Hazel
+
+Retired from University Computing Service
+
+Cambridge, England.
+
+
+Putting an actual email address here is a spam magnet. If you want to email me, +use my two names separated by a dot at gmail.com. +
+
+Last updated: 27 August 2021
+
+Copyright © 1997-2021 University of Cambridge.
+
+
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_callout_enumerate.html b/doc/html/pcre2_callout_enumerate.html new file mode 100644 index 0000000..505ea7b --- /dev/null +++ b/doc/html/pcre2_callout_enumerate.html @@ -0,0 +1,63 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_callout_enumerate(const pcre2_code *code, + int (*callback)(pcre2_callout_enumerate_block *, void *), + void *callout_data); +
++This function scans a compiled regular expression and calls the callback() +function for each callout within the pattern. The yield of the function is zero +for success and non-zero otherwise. The arguments are: +
+ code Points to the compiled pattern + callback The callback function + callout_data User data that is passed to the callback ++The callback() function is passed a pointer to a data block containing +the following fields (not necessarily in this order): +
+ uint32_t version Block version number + uint32_t callout_number Number for numbered callouts + PCRE2_SIZE pattern_position Offset to next item in pattern + PCRE2_SIZE next_item_length Length of next item in pattern + PCRE2_SIZE callout_string_offset Offset to string within pattern + PCRE2_SIZE callout_string_length Length of callout string + PCRE2_SPTR callout_string Points to callout string or is NULL ++The second argument passed to the callback() function is the callout data +that was passed to pcre2_callout_enumerate(). The callback() +function must return zero for success. Any other value causes the pattern scan +to stop, with the value being passed back as the result of +pcre2_callout_enumerate(). + +
+There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_code_copy.html b/doc/html/pcre2_code_copy.html new file mode 100644 index 0000000..667d7b7 --- /dev/null +++ b/doc/html/pcre2_code_copy.html @@ -0,0 +1,43 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++pcre2_code *pcre2_code_copy(const pcre2_code *code); +
++This function makes a copy of the memory used for a compiled pattern, excluding +any memory used by the JIT compiler. Without a subsequent call to +pcre2_jit_compile(), the copy can be used only for non-JIT matching. The +pointer to the character tables is copied, not the tables themselves (see +pcre2_code_copy_with_tables()). The yield of the function is NULL if +code is NULL or if sufficient memory cannot be obtained. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_code_copy_with_tables.html b/doc/html/pcre2_code_copy_with_tables.html new file mode 100644 index 0000000..67b2e1f --- /dev/null +++ b/doc/html/pcre2_code_copy_with_tables.html @@ -0,0 +1,44 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *code); +
++This function makes a copy of the memory used for a compiled pattern, excluding +any memory used by the JIT compiler. Without a subsequent call to +pcre2_jit_compile(), the copy can be used only for non-JIT matching. +Unlike pcre2_code_copy(), a separate copy of the character tables is also +made, with the new code pointing to it. This memory will be automatically freed +when pcre2_code_free() is called. The yield of the function is NULL if +code is NULL or if sufficient memory cannot be obtained. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_code_free.html b/doc/html/pcre2_code_free.html new file mode 100644 index 0000000..ff302fc --- /dev/null +++ b/doc/html/pcre2_code_free.html @@ -0,0 +1,42 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++void pcre2_code_free(pcre2_code *code); +
++If code is NULL, this function does nothing. Otherwise, code must +point to a compiled pattern. This function frees its memory, including any +memory used by the JIT compiler. If the compiled pattern was created by a call +to pcre2_code_copy_with_tables(), the memory for the character tables is +also freed. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_compile.html b/doc/html/pcre2_compile.html new file mode 100644 index 0000000..f0080ea --- /dev/null +++ b/doc/html/pcre2_compile.html @@ -0,0 +1,119 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++pcre2_code *pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE length, + uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset, + pcre2_compile_context *ccontext); +
++This function compiles a regular expression pattern into an internal form. Its +arguments are: +
+ pattern A string containing the expression to be compiled + length The length of the string or PCRE2_ZERO_TERMINATED + options Primary option bits + errorcode Where to put an error code + erroroffset Where to put an error offset + ccontext Pointer to a compile context or NULL ++The length of the pattern and any error offset that is returned are in code +units, not characters. A NULL pattern with zero length is treated as an empty +string. A compile context is needed only if you want to provide custom memory +allocation functions, or to provide an external function for system stack size +checking (see pcre2_set_compile_recursion_guard()), or to change one or +more of these parameters: +
+ What \R matches (Unicode newlines, or CR, LF, CRLF only); + PCRE2's character tables; + The newline character sequence; + The compile time nested parentheses limit; + The maximum pattern length (in code units) that is allowed; + The additional options bits. ++The primary option bits are: +
+ PCRE2_ANCHORED Force pattern anchoring + PCRE2_ALLOW_EMPTY_CLASS Allow empty classes + PCRE2_ALT_BSUX Alternative handling of \u, \U, and \x + PCRE2_ALT_CIRCUMFLEX Alternative handling of ^ in multiline mode + PCRE2_ALT_VERBNAMES Process backslashes in verb names + PCRE2_AUTO_CALLOUT Compile automatic callouts + PCRE2_CASELESS Do caseless matching + PCRE2_DOLLAR_ENDONLY $ not to match newline at end + PCRE2_DOTALL . matches anything including NL + PCRE2_DUPNAMES Allow duplicate names for subpatterns + PCRE2_ENDANCHORED Pattern can match only at end of subject + PCRE2_EXTENDED Ignore white space and # comments + PCRE2_FIRSTLINE Force matching to be before newline + PCRE2_LITERAL Pattern characters are all literal + PCRE2_MATCH_INVALID_UTF Enable support for matching invalid UTF + PCRE2_MATCH_UNSET_BACKREF Match unset backreferences + PCRE2_MULTILINE ^ and $ match newlines within data + PCRE2_NEVER_BACKSLASH_C Lock out the use of \C in patterns + PCRE2_NEVER_UCP Lock out PCRE2_UCP, e.g. via (*UCP) + PCRE2_NEVER_UTF Lock out PCRE2_UTF, e.g. via (*UTF) + PCRE2_NO_AUTO_CAPTURE Disable numbered capturing paren- + theses (named ones available) + PCRE2_NO_AUTO_POSSESS Disable auto-possessification + PCRE2_NO_DOTSTAR_ANCHOR Disable automatic anchoring for .* + PCRE2_NO_START_OPTIMIZE Disable match-time start optimizations + PCRE2_NO_UTF_CHECK Do not check the pattern for UTF validity + (only relevant if PCRE2_UTF is set) + PCRE2_UCP Use Unicode properties for \d, \w, etc. + PCRE2_UNGREEDY Invert greediness of quantifiers + PCRE2_USE_OFFSET_LIMIT Enable offset limit for unanchored matching + PCRE2_UTF Treat pattern and subjects as UTF strings ++PCRE2 must be built with Unicode support (the default) in order to use +PCRE2_UTF, PCRE2_UCP and related options. + +
+Additional options may be set in the compile context via the +pcre2_set_compile_extra_options +function. +
++If either of errorcode or erroroffset is NULL, the function returns +NULL immediately. Otherwise, the yield of this function is a pointer to a +private data structure that contains the compiled pattern, or NULL if an error +was detected. In the error case, a text error message can be obtained by +passing the value returned via the errorcode argument to the +pcre2_get_error_message() function. The offset (in code units) where the +error was encountered is returned via the erroroffset argument. +
++If there is no error, the value passed via errorcode returns the message +"no error" if passed to pcre2_get_error_message(), and the value passed +via erroroffset is zero. +
++There is a complete description of the PCRE2 native API, with more detail on +each option, in the +pcre2api +page, and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_compile_context_copy.html b/doc/html/pcre2_compile_context_copy.html new file mode 100644 index 0000000..9e9884b --- /dev/null +++ b/doc/html/pcre2_compile_context_copy.html @@ -0,0 +1,41 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++pcre2_compile_context *pcre2_compile_context_copy( + pcre2_compile_context *ccontext); +
++This function makes a new copy of a compile context, using the memory +allocation function that was used for the original context. The result is NULL +if the memory cannot be obtained. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_compile_context_create.html b/doc/html/pcre2_compile_context_create.html new file mode 100644 index 0000000..5eacd4e --- /dev/null +++ b/doc/html/pcre2_compile_context_create.html @@ -0,0 +1,42 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++pcre2_compile_context *pcre2_compile_context_create( + pcre2_general_context *gcontext); +
++This function creates and initializes a new compile context. If its argument is +NULL, malloc() is used to get the necessary memory; otherwise the memory +allocation function within the general context is used. The result is NULL if +the memory could not be obtained. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_compile_context_free.html b/doc/html/pcre2_compile_context_free.html new file mode 100644 index 0000000..b4159b1 --- /dev/null +++ b/doc/html/pcre2_compile_context_free.html @@ -0,0 +1,41 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++void pcre2_compile_context_free(pcre2_compile_context *ccontext); +
++This function frees the memory occupied by a compile context, using the memory +freeing function from the general context with which it was created, or +free() if that was not set. If the argument is NULL, the function returns +immediately without doing anything. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_config.html b/doc/html/pcre2_config.html new file mode 100644 index 0000000..f05bd06 --- /dev/null +++ b/doc/html/pcre2_config.html @@ -0,0 +1,84 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_config(uint32_t what, void *where); +
++This function makes it possible for a client program to find out which optional +features are available in the version of the PCRE2 library it is using. The +arguments are as follows: +
+ what A code specifying what information is required + where Points to where to put the information ++If where is NULL, the function returns the amount of memory needed for +the requested information. When the information is a string, the value is in +code units; for other types of data it is in bytes. + +
+If where is not NULL, for PCRE2_CONFIG_JITTARGET, +PCRE2_CONFIG_UNICODE_VERSION, and PCRE2_CONFIG_VERSION it must point to a +buffer that is large enough to hold the string. For all other codes it must +point to a uint32_t integer variable. The available codes are: +
+ PCRE2_CONFIG_BSR Indicates what \R matches by default: + PCRE2_BSR_UNICODE + PCRE2_BSR_ANYCRLF + PCRE2_CONFIG_COMPILED_WIDTHS Which of 8/16/32 support was compiled + PCRE2_CONFIG_DEPTHLIMIT Default backtracking depth limit + PCRE2_CONFIG_HEAPLIMIT Default heap memory limit + PCRE2_CONFIG_JIT Availability of just-in-time compiler support (1=yes 0=no) + PCRE2_CONFIG_JITTARGET Information (a string) about the target architecture for the JIT compiler + PCRE2_CONFIG_LINKSIZE Configured internal link size (2, 3, 4) + PCRE2_CONFIG_MATCHLIMIT Default internal resource limit + PCRE2_CONFIG_NEVER_BACKSLASH_C Whether or not \C is disabled + PCRE2_CONFIG_NEWLINE Code for the default newline sequence: + PCRE2_NEWLINE_CR + PCRE2_NEWLINE_LF + PCRE2_NEWLINE_CRLF + PCRE2_NEWLINE_ANY + PCRE2_NEWLINE_ANYCRLF + PCRE2_NEWLINE_NUL + PCRE2_CONFIG_PARENSLIMIT Default parentheses nesting limit + PCRE2_CONFIG_RECURSIONLIMIT Obsolete: use PCRE2_CONFIG_DEPTHLIMIT + PCRE2_CONFIG_STACKRECURSE Obsolete: always returns 0 + PCRE2_CONFIG_UNICODE Availability of Unicode support (1=yes 0=no) + PCRE2_CONFIG_UNICODE_VERSION The Unicode version (a string) + PCRE2_CONFIG_VERSION The PCRE2 version (a string) ++The function yields a non-negative value on success or the negative value +PCRE2_ERROR_BADOPTION otherwise. This is also the result for the +PCRE2_CONFIG_JITTARGET code if JIT support is not available. When a string is +requested, the function returns the number of code units used, including the +terminating zero. + +
+There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_convert_context_copy.html b/doc/html/pcre2_convert_context_copy.html new file mode 100644 index 0000000..3c44ac6 --- /dev/null +++ b/doc/html/pcre2_convert_context_copy.html @@ -0,0 +1,40 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++pcre2_convert_context *pcre2_convert_context_copy( + pcre2_convert_context *cvcontext); +
++This function is part of an experimental set of pattern conversion functions. +It makes a new copy of a convert context, using the memory allocation function +that was used for the original context. The result is NULL if the memory cannot +be obtained. +
++The pattern conversion functions are described in the +pcre2convert +documentation. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_convert_context_create.html b/doc/html/pcre2_convert_context_create.html new file mode 100644 index 0000000..2564780 --- /dev/null +++ b/doc/html/pcre2_convert_context_create.html @@ -0,0 +1,41 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++pcre2_convert_context *pcre2_convert_context_create( + pcre2_general_context *gcontext); +
++This function is part of an experimental set of pattern conversion functions. +It creates and initializes a new convert context. If its argument is +NULL, malloc() is used to get the necessary memory; otherwise the memory +allocation function within the general context is used. The result is NULL if +the memory could not be obtained. +
++The pattern conversion functions are described in the +pcre2convert +documentation. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_convert_context_free.html b/doc/html/pcre2_convert_context_free.html new file mode 100644 index 0000000..e9b142b --- /dev/null +++ b/doc/html/pcre2_convert_context_free.html @@ -0,0 +1,40 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++void pcre2_convert_context_free(pcre2_convert_context *cvcontext); +
++This function is part of an experimental set of pattern conversion functions. +It frees the memory occupied by a convert context, using the memory +freeing function from the general context with which it was created, or +free() if that was not set. If the argument is NULL, the function returns +immediately without doing anything. +
++The pattern conversion functions are described in the +pcre2convert +documentation. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_converted_pattern_free.html b/doc/html/pcre2_converted_pattern_free.html new file mode 100644 index 0000000..01d28d7 --- /dev/null +++ b/doc/html/pcre2_converted_pattern_free.html @@ -0,0 +1,40 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++void pcre2_converted_pattern_free(PCRE2_UCHAR *converted_pattern); +
++This function is part of an experimental set of pattern conversion functions. +It frees the memory occupied by a converted pattern that was obtained by +calling pcre2_pattern_convert() with arguments that caused it to place +the converted pattern into newly obtained heap memory. If the argument is NULL, +the function returns immediately without doing anything. +
++The pattern conversion functions are described in the +pcre2convert +documentation. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_dfa_match.html b/doc/html/pcre2_dfa_match.html new file mode 100644 index 0000000..0ae428c --- /dev/null +++ b/doc/html/pcre2_dfa_match.html @@ -0,0 +1,86 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, + PCRE2_SIZE length, PCRE2_SIZE startoffset, + uint32_t options, pcre2_match_data *match_data, + pcre2_match_context *mcontext, + int *workspace, PCRE2_SIZE wscount); +
++This function matches a compiled regular expression against a given subject +string, using an alternative matching algorithm that scans the subject string +just once (except when processing lookaround assertions). This function is +not Perl-compatible (the Perl-compatible matching function is +pcre2_match()). The arguments for this function are: +
+ code Points to the compiled pattern + subject Points to the subject string + length Length of the subject string + startoffset Offset in the subject at which to start matching + options Option bits + match_data Points to a match data block, for results + mcontext Points to a match context, or is NULL + workspace Points to a vector of ints used as working space + wscount Number of elements in the vector ++The size of the output vector needed to contain all the results depends on the +number of simultaneous matches, not on the number of parentheses in the +pattern. Using pcre2_match_data_create_from_pattern() to create the match +data block is therefore not advisable when using this function. +
+A match context is needed only if you want to set up a callout function or +specify the heap limit, the match limit, or the recursion depth limit. The +length and startoffset values are code units, not characters. The +options are: +
+ PCRE2_ANCHORED Match only at the first position + PCRE2_COPY_MATCHED_SUBJECT + On success, make a private subject copy + PCRE2_ENDANCHORED Pattern can match only at end of subject + PCRE2_NOTBOL Subject is not the beginning of a line + PCRE2_NOTEOL Subject is not the end of a line + PCRE2_NOTEMPTY An empty string is not a valid match + PCRE2_NOTEMPTY_ATSTART An empty string at the start of the subject is not a valid match + PCRE2_NO_UTF_CHECK Do not check the subject for UTF validity (only relevant if PCRE2_UTF + was set at compile time) + PCRE2_PARTIAL_HARD Return PCRE2_ERROR_PARTIAL for a partial match even if there is a full match + PCRE2_PARTIAL_SOFT Return PCRE2_ERROR_PARTIAL for a partial match if no full matches are found + PCRE2_DFA_RESTART Restart after a partial match + PCRE2_DFA_SHORTEST Return only the shortest match ++There are restrictions on what may appear in a pattern when using this matching +function. Details are given in the +pcre2matching +documentation. For details of partial matching, see the +pcre2partial +page. There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_general_context_copy.html b/doc/html/pcre2_general_context_copy.html new file mode 100644 index 0000000..0018534 --- /dev/null +++ b/doc/html/pcre2_general_context_copy.html @@ -0,0 +1,42 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++pcre2_general_context *pcre2_general_context_copy( + pcre2_general_context *gcontext); +
++This function makes a new copy of a general context, using the memory +allocation functions in the context, if set, to get the necessary memory. +Otherwise malloc() is used. The result is NULL if the memory cannot be +obtained. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_general_context_create.html b/doc/html/pcre2_general_context_create.html new file mode 100644 index 0000000..a1a165d --- /dev/null +++ b/doc/html/pcre2_general_context_create.html @@ -0,0 +1,44 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++pcre2_general_context *pcre2_general_context_create( + void *(*private_malloc)(size_t, void *), + void (*private_free)(void *, void *), void *memory_data); +
++This function creates and initializes a general context. The arguments define +custom memory management functions and a data value that is passed to them when +they are called. The private_malloc() function is used to get memory for +the context. If either of the first two arguments is NULL, the system memory +management function is used. The result is NULL if no memory could be obtained. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_general_context_free.html b/doc/html/pcre2_general_context_free.html new file mode 100644 index 0000000..9f335f5 --- /dev/null +++ b/doc/html/pcre2_general_context_free.html @@ -0,0 +1,40 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++void pcre2_general_context_free(pcre2_general_context *gcontext); +
++This function frees the memory occupied by a general context, using the memory +freeing function within the context, if set. If the argument is NULL, the +function returns immediately without doing anything. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_get_error_message.html b/doc/html/pcre2_get_error_message.html new file mode 100644 index 0000000..7005760 --- /dev/null +++ b/doc/html/pcre2_get_error_message.html @@ -0,0 +1,51 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_get_error_message(int errorcode, PCRE2_UCHAR *buffer, + PCRE2_SIZE bufflen); +
++This function provides a textual error message for each PCRE2 error code. +Compilation errors are positive numbers; UTF formatting errors and matching +errors are negative numbers. The arguments are: +
+ errorcode an error code (positive or negative) + buffer where to put the message + bufflen the length of the buffer (code units) ++The function returns the length of the message in code units, excluding the +trailing zero, or the negative error code PCRE2_ERROR_NOMEMORY if the buffer is +too small. In this case, the returned message is truncated (but still with a +trailing zero). If errorcode does not contain a recognized error code +number, the negative value PCRE2_ERROR_BADDATA is returned. + +
+There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_get_mark.html b/doc/html/pcre2_get_mark.html new file mode 100644 index 0000000..88e6326 --- /dev/null +++ b/doc/html/pcre2_get_mark.html @@ -0,0 +1,47 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++PCRE2_SPTR pcre2_get_mark(pcre2_match_data *match_data); +
++After a call of pcre2_match() that was passed the match block that is +this function's argument, this function returns a pointer to the last (*MARK), +(*PRUNE), or (*THEN) name that was encountered during the matching process. The +name is zero-terminated, and is within the compiled pattern. The length of the +name is in the preceding code unit. If no name is available, NULL is returned. +
++After a successful match, the name that is returned is the last one on the +matching path. After a failed match or a partial match, the last encountered +name is returned. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_get_match_data_heapframes_size.html b/doc/html/pcre2_get_match_data_heapframes_size.html new file mode 100644 index 0000000..3c705c6 --- /dev/null +++ b/doc/html/pcre2_get_match_data_heapframes_size.html @@ -0,0 +1,40 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++PCRE2_SIZE pcre2_get_match_data_heapframes_size( + pcre2_match_data *match_data); +
++This function returns the size, in bytes, of the heapframes data block that is +owned by its argument. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_get_match_data_size.html b/doc/html/pcre2_get_match_data_size.html new file mode 100644 index 0000000..113ecaa --- /dev/null +++ b/doc/html/pcre2_get_match_data_size.html @@ -0,0 +1,39 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++PCRE2_SIZE pcre2_get_match_data_size(pcre2_match_data *match_data); +
++This function returns the size, in bytes, of the match data block that is its +argument. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_get_ovector_count.html b/doc/html/pcre2_get_ovector_count.html new file mode 100644 index 0000000..05aacb6 --- /dev/null +++ b/doc/html/pcre2_get_ovector_count.html @@ -0,0 +1,39 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++uint32_t pcre2_get_ovector_count(pcre2_match_data *match_data); +
++This function returns the number of pairs of offsets in the ovector that forms +part of the given match data block. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_get_ovector_pointer.html b/doc/html/pcre2_get_ovector_pointer.html new file mode 100644 index 0000000..ff6317e --- /dev/null +++ b/doc/html/pcre2_get_ovector_pointer.html @@ -0,0 +1,40 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *match_data); +
++This function returns a pointer to the vector of offsets that forms part of the +given match data block. The number of pairs can be found by calling +pcre2_get_ovector_count(). +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_get_startchar.html b/doc/html/pcre2_get_startchar.html new file mode 100644 index 0000000..d2c28b2 --- /dev/null +++ b/doc/html/pcre2_get_startchar.html @@ -0,0 +1,44 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *match_data); +
++After a successful call of pcre2_match() that was passed the match block +that is this function's argument, this function returns the code unit offset of +the character at which the successful match started. For a non-partial match, +this can be different to the value of ovector[0] if the pattern contains +the \K escape sequence. After a partial match, however, this value is always +the same as ovector[0] because \K does not affect the result of a +partial match. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_jit_compile.html b/doc/html/pcre2_jit_compile.html new file mode 100644 index 0000000..873d0dd --- /dev/null +++ b/doc/html/pcre2_jit_compile.html @@ -0,0 +1,63 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_jit_compile(pcre2_code *code, uint32_t options); +
++This function requests JIT compilation, which, if the just-in-time compiler is +available, further processes a compiled pattern into machine code that executes +much faster than the pcre2_match() interpretive matching function. Full +details are given in the +pcre2jit +documentation. +
++The first argument is a pointer that was returned by a successful call to +pcre2_compile(), and the second must contain one or more of the following +bits: +
+ PCRE2_JIT_COMPLETE compile code for full matching + PCRE2_JIT_PARTIAL_SOFT compile code for soft partial matching + PCRE2_JIT_PARTIAL_HARD compile code for hard partial matching ++There is also an obsolete option called PCRE2_JIT_INVALID_UTF, which has been +superseded by the pcre2_compile() option PCRE2_MATCH_INVALID_UTF. The old +option is deprecated and may be removed in the future. + +
+The yield of the function is 0 for success, or a negative error code otherwise. +In particular, PCRE2_ERROR_JIT_BADOPTION is returned if JIT is not supported or +if an unknown bit is set in options. The function can also return +PCRE2_ERROR_NOMEMORY if JIT is unable to allocate executable memory for the +compiler, even if it was because of a system security restriction. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_jit_free_unused_memory.html b/doc/html/pcre2_jit_free_unused_memory.html new file mode 100644 index 0000000..7f37e58 --- /dev/null +++ b/doc/html/pcre2_jit_free_unused_memory.html @@ -0,0 +1,43 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++void pcre2_jit_free_unused_memory(pcre2_general_context *gcontext); +
++This function frees unused JIT executable memory. The argument is a general +context, for custom memory management, or NULL for standard memory management. +JIT memory allocation retains some memory in order to improve future JIT +compilation speed. In low memory conditions, +pcre2_jit_free_unused_memory() can be used to cause this memory to be +freed. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_jit_match.html b/doc/html/pcre2_jit_match.html new file mode 100644 index 0000000..56144ff --- /dev/null +++ b/doc/html/pcre2_jit_match.html @@ -0,0 +1,70 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_jit_match(const pcre2_code *code, PCRE2_SPTR subject, + PCRE2_SIZE length, PCRE2_SIZE startoffset, + uint32_t options, pcre2_match_data *match_data, + pcre2_match_context *mcontext); +
++This function matches a compiled regular expression that has been successfully +processed by the JIT compiler against a given subject string, using a matching +algorithm that is similar to Perl's. It is a "fast path" interface to JIT, and +it bypasses some of the sanity checks that pcre2_match() applies. +
++In UTF mode, the subject string is not checked for UTF validity. Unless +PCRE2_MATCH_INVALID_UTF was set when the pattern was compiled, passing an +invalid UTF string results in undefined behaviour. Your program may crash or +loop or give wrong results. In the absence of PCRE2_MATCH_INVALID_UTF you +should only call pcre2_jit_match() in UTF mode if you are sure the +subject is valid. +
++The arguments for pcre2_jit_match() are exactly the same as for +pcre2_match(), +except that the subject string must be specified with a length; +PCRE2_ZERO_TERMINATED is not supported. +
++The supported options are PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, +PCRE2_NOTEMPTY_ATSTART, PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. Unsupported +options are ignored. +
++The return values are the same as for pcre2_match() plus +PCRE2_ERROR_JIT_BADOPTION if a matching mode (partial or complete) is requested +that was not compiled. For details of partial matching, see the +pcre2partial +page. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the JIT API in the +pcre2jit +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_jit_stack_assign.html b/doc/html/pcre2_jit_stack_assign.html new file mode 100644 index 0000000..4b3abb9 --- /dev/null +++ b/doc/html/pcre2_jit_stack_assign.html @@ -0,0 +1,75 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++void pcre2_jit_stack_assign(pcre2_match_context *mcontext, + pcre2_jit_callback callback_function, void *callback_data); +
++This function provides control over the memory used by JIT as a run-time stack +when pcre2_match() or pcre2_jit_match() is called with a pattern +that has been successfully processed by the JIT compiler. The information that +determines which stack is used is put into a match context that is subsequently +passed to a matching function. The arguments of this function are: +
+ mcontext a pointer to a match context + callback a callback function + callback_data a JIT stack or a value to be passed to the callback ++ +
+If mcontext is NULL, the function returns immediately, without doing +anything. +
++If callback is NULL and callback_data is NULL, an internal 32KiB +block on the machine stack is used. +
++If callback is NULL and callback_data is not NULL, +callback_data must be a valid JIT stack, the result of calling +pcre2_jit_stack_create(). +
++If callback is not NULL, it is called with callback_data as an +argument at the start of matching, in order to set up a JIT stack. If the +result is NULL, the internal 32KiB stack is used; otherwise the return value +must be a valid JIT stack, the result of calling +pcre2_jit_stack_create(). +
++You may safely use the same JIT stack for multiple patterns, as long as they +are all matched in the same thread. In a multithread application, each thread +must use its own JIT stack. For more details, see the +pcre2jit +page. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_jit_stack_create.html b/doc/html/pcre2_jit_stack_create.html new file mode 100644 index 0000000..b9dc59d --- /dev/null +++ b/doc/html/pcre2_jit_stack_create.html @@ -0,0 +1,50 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++pcre2_jit_stack *pcre2_jit_stack_create(size_t startsize, + size_t maxsize, pcre2_general_context *gcontext); +
++This function is used to create a stack for use by the code compiled by the JIT +compiler. The first two arguments are a starting size for the stack, and a +maximum size to which it is allowed to grow. The final argument is a general +context, for memory allocation functions, or NULL for standard memory +allocation. The result can be passed to the JIT run-time code by calling +pcre2_jit_stack_assign() to associate the stack with a compiled pattern, +which can then be processed by pcre2_match() or pcre2_jit_match(). +A maximum stack size of 512KiB to 1MiB should be more than enough for any +pattern. If the stack couldn't be allocated or the values passed were not +reasonable, NULL will be returned. For more details, see the +pcre2jit +page. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_jit_stack_free.html b/doc/html/pcre2_jit_stack_free.html new file mode 100644 index 0000000..1d078d7 --- /dev/null +++ b/doc/html/pcre2_jit_stack_free.html @@ -0,0 +1,43 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++void pcre2_jit_stack_free(pcre2_jit_stack *jit_stack); +
++This function is used to free a JIT stack that was created by +pcre2_jit_stack_create() when it is no longer needed. If the argument is +NULL, the function returns immediately without doing anything. For more +details, see the +pcre2jit +page. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_maketables.html b/doc/html/pcre2_maketables.html new file mode 100644 index 0000000..1963654 --- /dev/null +++ b/doc/html/pcre2_maketables.html @@ -0,0 +1,48 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++const uint8_t *pcre2_maketables(pcre2_general_context *gcontext); +
++This function builds a set of character tables for character code points that +are less than 256. These can be passed to pcre2_compile() in a compile +context in order to override the internal, built-in tables (which were either +defaulted or made by pcre2_maketables() when PCRE2 was compiled). See the +pcre2_set_character_tables() +page. You might want to do this if you are using a non-standard locale. +
++If the argument is NULL, malloc() is used to get memory for the tables. +Otherwise it must point to a general context, which can supply pointers to a +custom memory manager. The function yields a pointer to the tables. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_maketables_free.html b/doc/html/pcre2_maketables_free.html new file mode 100644 index 0000000..7316ab2 --- /dev/null +++ b/doc/html/pcre2_maketables_free.html @@ -0,0 +1,44 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++void pcre2_maketables_free(pcre2_general_context *gcontext, + const uint8_t *tables); +
++This function discards a set of character tables that were created by a call +to +pcre2_maketables(). +
++The gcontext parameter should match what was used in that call to +account for any custom allocators that might be in use; if it is NULL +the system free() is used. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_match.html b/doc/html/pcre2_match.html new file mode 100644 index 0000000..5584ae3 --- /dev/null +++ b/doc/html/pcre2_match.html @@ -0,0 +1,87 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, + PCRE2_SIZE length, PCRE2_SIZE startoffset, + uint32_t options, pcre2_match_data *match_data, + pcre2_match_context *mcontext); +
++This function matches a compiled regular expression against a given subject +string, using a matching algorithm that is similar to Perl's. It returns +offsets to what it has matched and to captured substrings via the +match_data block, which can be processed by functions with names that +start with pcre2_get_ovector_...() or pcre2_substring_...(). The +return from pcre2_match() is one more than the highest numbered capturing +pair that has been set (for example, 1 if there are no captures), zero if the +vector of offsets is too small, or a negative error code for no match and other +errors. The function arguments are: +
+ code Points to the compiled pattern + subject Points to the subject string + length Length of the subject string + startoffset Offset in the subject at which to start matching + options Option bits + match_data Points to a match data block, for results + mcontext Points to a match context, or is NULL ++A match context is needed only if you want to: +
+ Set up a callout function + Set a matching offset limit + Change the heap memory limit + Change the backtracking match limit + Change the backtracking depth limit + Set custom memory management specifically for the match ++The length and startoffset values are code units, not characters. +The length may be given as PCRE2_ZERO_TERMINATED for a subject that is +terminated by a binary zero code unit. The options are: +
+ PCRE2_ANCHORED Match only at the first position + PCRE2_COPY_MATCHED_SUBJECT + On success, make a private subject copy + PCRE2_DISABLE_RECURSELOOP_CHECK + Only useful in rare cases; use with care + PCRE2_ENDANCHORED Pattern can match only at end of subject + PCRE2_NOTBOL Subject string is not the beginning of a line + PCRE2_NOTEOL Subject string is not the end of a line + PCRE2_NOTEMPTY An empty string is not a valid match + PCRE2_NOTEMPTY_ATSTART An empty string at the start of the subject is not a valid match + PCRE2_NO_JIT Do not use JIT matching + PCRE2_NO_UTF_CHECK Do not check the subject for UTF validity (only relevant if PCRE2_UTF + was set at compile time) + PCRE2_PARTIAL_HARD Return PCRE2_ERROR_PARTIAL for a partial match even if there is a full match + PCRE2_PARTIAL_SOFT Return PCRE2_ERROR_PARTIAL for a partial match if no full matches are found ++For details of partial matching, see the +pcre2partial +page. There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_match_context_copy.html b/doc/html/pcre2_match_context_copy.html new file mode 100644 index 0000000..4a719d6 --- /dev/null +++ b/doc/html/pcre2_match_context_copy.html @@ -0,0 +1,41 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++pcre2_match_context *pcre2_match_context_copy( + pcre2_match_context *mcontext); +
++This function makes a new copy of a match context, using the memory +allocation function that was used for the original context. The result is NULL +if the memory cannot be obtained. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_match_context_create.html b/doc/html/pcre2_match_context_create.html new file mode 100644 index 0000000..f7f2735 --- /dev/null +++ b/doc/html/pcre2_match_context_create.html @@ -0,0 +1,42 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++pcre2_match_context *pcre2_match_context_create( + pcre2_general_context *gcontext); +
++This function creates and initializes a new match context. If its argument is +NULL, malloc() is used to get the necessary memory; otherwise the memory +allocation function within the general context is used. The result is NULL if +the memory could not be obtained. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_match_context_free.html b/doc/html/pcre2_match_context_free.html new file mode 100644 index 0000000..7f00ea9 --- /dev/null +++ b/doc/html/pcre2_match_context_free.html @@ -0,0 +1,41 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++void pcre2_match_context_free(pcre2_match_context *mcontext); +
++This function frees the memory occupied by a match context, using the memory +freeing function from the general context with which it was created, or +free() if that was not set. If the argument is NULL, the function returns +immediately without doing anything. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_match_data_create.html b/doc/html/pcre2_match_data_create.html new file mode 100644 index 0000000..c26c3b3 --- /dev/null +++ b/doc/html/pcre2_match_data_create.html @@ -0,0 +1,50 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++pcre2_match_data *pcre2_match_data_create(uint32_t ovecsize, + pcre2_general_context *gcontext); +
++This function creates a new match data block, which is used for holding the +result of a match. The first argument specifies the number of pairs of offsets +that are required. These form the "output vector" (ovector) within the match +data block, and are used to identify the matched string and any captured +substrings when matching with pcre2_match(), or a number of different +matches at the same point when used with pcre2_dfa_match(). There is +always one pair of offsets; if ovecsize is zero, it is treated as one. +
++The second argument points to a general context, for custom memory management, +or is NULL for system memory management. The result of the function is NULL if +the memory for the block could not be obtained. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_match_data_create_from_pattern.html b/doc/html/pcre2_match_data_create_from_pattern.html new file mode 100644 index 0000000..db58ab9 --- /dev/null +++ b/doc/html/pcre2_match_data_create_from_pattern.html @@ -0,0 +1,53 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++pcre2_match_data *pcre2_match_data_create_from_pattern( + const pcre2_code *code, pcre2_general_context *gcontext); +
++This function creates a new match data block for holding the result of a match. +The first argument points to a compiled pattern. The number of capturing +parentheses within the pattern is used to compute the number of pairs of +offsets that are required in the match data block. These form the "output +vector" (ovector) within the match data block, and are used to identify the +matched string and any captured substrings when matching with +pcre2_match(). If you are using pcre2_dfa_match(), which uses the +output vector in a different way, you should use pcre2_match_data_create() +instead of this function. +
++The second argument points to a general context, for custom memory management, +or is NULL to use the same memory allocator as was used for the compiled +pattern. The result of the function is NULL if the memory for the block could +not be obtained. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_match_data_free.html b/doc/html/pcre2_match_data_free.html new file mode 100644 index 0000000..1c2520b --- /dev/null +++ b/doc/html/pcre2_match_data_free.html @@ -0,0 +1,48 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++void pcre2_match_data_free(pcre2_match_data *match_data); +
++If match_data is NULL, this function does nothing. Otherwise, +match_data must point to a match data block, which this function frees, +using the memory freeing function from the general context or compiled pattern +with which it was created, or free() if that was not set. If the match +data block was previously passed to pcre2_match(), it will have an +attached heapframe vector; this is also freed. +
++If the PCRE2_COPY_MATCHED_SUBJECT option was used for a successful match using this +match data block, the copy of the subject that was referenced within the block +is also freed. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_pattern_convert.html b/doc/html/pcre2_pattern_convert.html new file mode 100644 index 0000000..2fcd7cc --- /dev/null +++ b/doc/html/pcre2_pattern_convert.html @@ -0,0 +1,70 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_pattern_convert(PCRE2_SPTR pattern, PCRE2_SIZE length, + uint32_t options, PCRE2_UCHAR **buffer, + PCRE2_SIZE *blength, pcre2_convert_context *cvcontext); +
++This function is part of an experimental set of pattern conversion functions. +It converts a foreign pattern (for example, a glob) into a PCRE2 regular +expression pattern. Its arguments are: +
+ pattern The foreign pattern + length The length of the input pattern or PCRE2_ZERO_TERMINATED + options Option bits + buffer Pointer to pointer to output buffer, or NULL + blength Pointer to output length field + cvcontext Pointer to a convert context or NULL ++The length of the converted pattern (excluding the terminating zero) is +returned via blength. If buffer is NULL, the function just returns +the output length. If buffer points to a NULL pointer, heap memory is +obtained for the converted pattern, using the allocator in the context if +present (or else malloc()), and the field pointed to by buffer is +updated. If buffer points to a non-NULL field, that must point to a +buffer whose size is in the variable pointed to by blength. This value is +updated. + +
+The option bits are: +
+ PCRE2_CONVERT_UTF Input is UTF + PCRE2_CONVERT_NO_UTF_CHECK Do not check UTF validity + PCRE2_CONVERT_POSIX_BASIC Convert POSIX basic pattern + PCRE2_CONVERT_POSIX_EXTENDED Convert POSIX extended pattern + PCRE2_CONVERT_GLOB ) Convert + PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR ) various types + PCRE2_CONVERT_GLOB_NO_STARSTAR ) of glob ++The return value from pcre2_pattern_convert() is zero on success or a +non-zero PCRE2 error code. + +
+The pattern conversion functions are described in the +pcre2convert +documentation. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_pattern_info.html b/doc/html/pcre2_pattern_info.html new file mode 100644 index 0000000..eaaac6c --- /dev/null +++ b/doc/html/pcre2_pattern_info.html @@ -0,0 +1,109 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_pattern_info(const pcre2_code *code, uint32_t what, + void *where); +
++This function returns information about a compiled pattern. Its arguments are: +
+ code Pointer to a compiled regular expression pattern + what What information is required + where Where to put the information ++The recognized values for the what argument, and the information they +request are as follows: +
+ PCRE2_INFO_ALLOPTIONS Final options after compiling + PCRE2_INFO_ARGOPTIONS Options passed to pcre2_compile() + PCRE2_INFO_BACKREFMAX Number of highest backreference + PCRE2_INFO_BSR What \R matches: + PCRE2_BSR_UNICODE: Unicode line endings + PCRE2_BSR_ANYCRLF: CR, LF, or CRLF only + PCRE2_INFO_CAPTURECOUNT Number of capturing subpatterns + PCRE2_INFO_DEPTHLIMIT Backtracking depth limit if set, otherwise PCRE2_ERROR_UNSET + PCRE2_INFO_EXTRAOPTIONS Extra options that were passed in the + compile context + PCRE2_INFO_FIRSTBITMAP Bitmap of first code units, or NULL + PCRE2_INFO_FIRSTCODETYPE Type of start-of-match information + 0 nothing set + 1 first code unit is set + 2 start of string or after newline + PCRE2_INFO_FIRSTCODEUNIT First code unit when type is 1 + PCRE2_INFO_FRAMESIZE Size of backtracking frame + PCRE2_INFO_HASBACKSLASHC Return 1 if pattern contains \C + PCRE2_INFO_HASCRORLF Return 1 if explicit CR or LF matches exist in the pattern + PCRE2_INFO_HEAPLIMIT Heap memory limit if set, otherwise PCRE2_ERROR_UNSET + PCRE2_INFO_JCHANGED Return 1 if (?J) or (?-J) was used + PCRE2_INFO_JITSIZE Size of JIT compiled code, or 0 + PCRE2_INFO_LASTCODETYPE Type of must-be-present information + 0 nothing set + 1 code unit is set + PCRE2_INFO_LASTCODEUNIT Last code unit when type is 1 + PCRE2_INFO_MATCHEMPTY 1 if the pattern can match an empty string, 0 otherwise + PCRE2_INFO_MATCHLIMIT Match limit if set, otherwise PCRE2_ERROR_UNSET + PCRE2_INFO_MAXLOOKBEHIND Length (in characters) of the longest lookbehind assertion + PCRE2_INFO_MINLENGTH Lower bound length of matching strings + PCRE2_INFO_NAMECOUNT Number of named subpatterns + PCRE2_INFO_NAMEENTRYSIZE Size of name table entries + PCRE2_INFO_NAMETABLE Pointer to name table + PCRE2_INFO_NEWLINE Code for the newline sequence: + PCRE2_NEWLINE_CR + PCRE2_NEWLINE_LF + PCRE2_NEWLINE_CRLF + PCRE2_NEWLINE_ANY + PCRE2_NEWLINE_ANYCRLF + PCRE2_NEWLINE_NUL + PCRE2_INFO_RECURSIONLIMIT Obsolete synonym for 
PCRE2_INFO_DEPTHLIMIT + PCRE2_INFO_SIZE Size of compiled pattern ++If where is NULL, the function returns the amount of memory needed for +the requested information, in bytes. Otherwise, the where argument must +point to an unsigned 32-bit integer (uint32_t variable), except for the +following what values, when it must point to a variable of the type +shown: +
+ PCRE2_INFO_FIRSTBITMAP const uint8_t * + PCRE2_INFO_JITSIZE size_t + PCRE2_INFO_NAMETABLE PCRE2_SPTR + PCRE2_INFO_SIZE size_t ++The yield of the function is zero on success or: +
+ PCRE2_ERROR_NULL the argument code is NULL + PCRE2_ERROR_BADMAGIC the "magic number" was not found + PCRE2_ERROR_BADOPTION the value of what is invalid + PCRE2_ERROR_BADMODE the pattern was compiled in the wrong mode + PCRE2_ERROR_UNSET the requested information is not set ++ +
+There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_serialize_decode.html b/doc/html/pcre2_serialize_decode.html new file mode 100644 index 0000000..618ffa9 --- /dev/null +++ b/doc/html/pcre2_serialize_decode.html @@ -0,0 +1,65 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int32_t pcre2_serialize_decode(pcre2_code **codes, + int32_t number_of_codes, const uint8_t *bytes, + pcre2_general_context *gcontext); +
++This function decodes a serialized set of compiled patterns back into a list of +individual patterns. This is possible only on a host that is running the same +version of PCRE2, with the same code unit width, and the host must also have +the same endianness, pointer width and PCRE2_SIZE type. The arguments for +pcre2_serialize_decode() are: +
+ codes pointer to a vector in which to build the list + number_of_codes number of slots in the vector + bytes the serialized byte stream + gcontext pointer to a general context or NULL ++The bytes argument must point to a block of data that was originally +created by pcre2_serialize_encode(), though it may have been saved on +disc or elsewhere in the meantime. If there are more codes in the serialized +data than slots in the list, only those compiled patterns that will fit are +decoded. The yield of the function is the number of decoded patterns, or one of +the following negative error codes: +
+ PCRE2_ERROR_BADDATA number_of_codes is zero or less + PCRE2_ERROR_BADMAGIC mismatch of id bytes in bytes + PCRE2_ERROR_BADMODE mismatch of code unit size or PCRE2 version + PCRE2_ERROR_NOMEMORY memory allocation failed + PCRE2_ERROR_NULL codes or bytes is NULL ++PCRE2_ERROR_BADMAGIC may mean that the data is corrupt, or that it was compiled +on a system with different endianness. + +
+There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the serialization functions in the +pcre2serialize +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_serialize_encode.html b/doc/html/pcre2_serialize_encode.html new file mode 100644 index 0000000..f153270 --- /dev/null +++ b/doc/html/pcre2_serialize_encode.html @@ -0,0 +1,66 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int32_t pcre2_serialize_encode(const pcre2_code **codes, + int32_t number_of_codes, uint8_t **serialized_bytes, + PCRE2_SIZE *serialized_size, pcre2_general_context *gcontext); +
++This function encodes a list of compiled patterns into a byte stream that can +be saved on disc or elsewhere. Note that this is not an abstract format like +Java or .NET. Conversion of the byte stream back into usable compiled patterns +can only happen on a host that is running the same version of PCRE2, with the +same code unit width, and the host must also have the same endianness, pointer +width and PCRE2_SIZE type. The arguments for pcre2_serialize_encode() +are: +
+ codes pointer to a vector containing the list + number_of_codes number of slots in the vector + serialized_bytes set to point to the serialized byte stream + serialized_size set to the number of bytes in the byte stream + gcontext pointer to a general context or NULL ++The context argument is used to obtain memory for the byte stream. When the +serialized data is no longer needed, it must be freed by calling +pcre2_serialize_free(). The yield of the function is the number of +serialized patterns, or one of the following negative error codes: +
+ PCRE2_ERROR_BADDATA number_of_codes is zero or less + PCRE2_ERROR_BADMAGIC mismatch of id bytes in one of the patterns + PCRE2_ERROR_NOMEMORY memory allocation failed + PCRE2_ERROR_MIXEDTABLES the patterns do not all use the same tables + PCRE2_ERROR_NULL an argument other than gcontext is NULL ++PCRE2_ERROR_BADMAGIC means either that a pattern's code has been corrupted, or +that a slot in the vector does not point to a compiled pattern. + +
+There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the serialization functions in the +pcre2serialize +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_serialize_free.html b/doc/html/pcre2_serialize_free.html new file mode 100644 index 0000000..26b435b --- /dev/null +++ b/doc/html/pcre2_serialize_free.html @@ -0,0 +1,41 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++void pcre2_serialize_free(uint8_t *bytes); +
++This function frees the memory that was obtained by +pcre2_serialize_encode() to hold a serialized byte stream. The argument +must point to such a byte stream or be NULL, in which case the function returns +without doing anything. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the serialization functions in the +pcre2serialize +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_serialize_get_number_of_codes.html b/doc/html/pcre2_serialize_get_number_of_codes.html new file mode 100644 index 0000000..fdd2429 --- /dev/null +++ b/doc/html/pcre2_serialize_get_number_of_codes.html @@ -0,0 +1,49 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int32_t pcre2_serialize_get_number_of_codes(const uint8_t *bytes); +
++The bytes argument must point to a serialized byte stream that was +originally created by pcre2_serialize_encode() (though it may have been +saved on disc or elsewhere in the meantime). The function returns the number of +serialized patterns in the byte stream, or one of the following negative error +codes: +
+ PCRE2_ERROR_BADMAGIC mismatch of id bytes in bytes + PCRE2_ERROR_BADMODE mismatch of code unit size or PCRE2 version + PCRE2_ERROR_NULL the argument is NULL ++PCRE2_ERROR_BADMAGIC may mean that the data is corrupt, or that it was compiled +on a system with different endianness. + +
+There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the serialization functions in the +pcre2serialize +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_set_bsr.html b/doc/html/pcre2_set_bsr.html new file mode 100644 index 0000000..8a62f18 --- /dev/null +++ b/doc/html/pcre2_set_bsr.html @@ -0,0 +1,42 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_set_bsr(pcre2_compile_context *ccontext, + uint32_t value); +
++This function sets the convention for processing \R within a compile context. +The second argument must be one of PCRE2_BSR_ANYCRLF or PCRE2_BSR_UNICODE. The +result is zero for success or PCRE2_ERROR_BADDATA if the second argument is +invalid. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_set_callout.html b/doc/html/pcre2_set_callout.html new file mode 100644 index 0000000..4e7aca6 --- /dev/null +++ b/doc/html/pcre2_set_callout.html @@ -0,0 +1,43 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_set_callout(pcre2_match_context *mcontext, + int (*callout_function)(pcre2_callout_block *), + void *callout_data); +
++This function sets the callout fields in a match context (the first argument). +The second argument specifies a callout function, and the third argument is an +opaque data item that is passed to it. The result of this function is always +zero. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_set_character_tables.html b/doc/html/pcre2_set_character_tables.html new file mode 100644 index 0000000..8564eea --- /dev/null +++ b/doc/html/pcre2_set_character_tables.html @@ -0,0 +1,45 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_set_character_tables(pcre2_compile_context *ccontext, + const uint8_t *tables); +
++This function sets a pointer to custom character tables within a compile +context. The second argument must point to a set of PCRE2 character tables or +be NULL to request the default tables. The result is always zero. Character +tables can be created by calling pcre2_maketables() or by running the +pcre2_dftables maintenance command in binary mode (see the +pcre2build +documentation). +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_set_compile_extra_options.html b/doc/html/pcre2_set_compile_extra_options.html new file mode 100644 index 0000000..4924ed7 --- /dev/null +++ b/doc/html/pcre2_set_compile_extra_options.html @@ -0,0 +1,54 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_set_compile_extra_options(pcre2_compile_context *ccontext, + uint32_t extra_options); +
++This function sets additional option bits for pcre2_compile() that are +housed in a compile context. It completely replaces all the bits. The extra +options are: +
+ PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK Allow \K in lookarounds
+ PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES Allow \x{d800} to \x{dfff} in UTF-8 and UTF-32 modes
+ PCRE2_EXTRA_ALT_BSUX Extended alternate \u, \U, and \x handling
+ PCRE2_EXTRA_ASCII_BSD \d remains ASCII in UCP mode
+ PCRE2_EXTRA_ASCII_BSS \s remains ASCII in UCP mode
+ PCRE2_EXTRA_ASCII_BSW \w remains ASCII in UCP mode
+ PCRE2_EXTRA_ASCII_DIGIT [:digit:] and [:xdigit:] POSIX classes remain ASCII in UCP mode
+ PCRE2_EXTRA_ASCII_POSIX POSIX classes remain ASCII in UCP mode
+ PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL Treat all invalid escapes as a literal following character
+ PCRE2_EXTRA_CASELESS_RESTRICT Disable mixed ASCII/non-ASCII case folding
+ PCRE2_EXTRA_ESCAPED_CR_IS_LF Interpret \r as \n
+ PCRE2_EXTRA_MATCH_LINE Pattern matches whole lines
+ PCRE2_EXTRA_MATCH_WORD Pattern matches "words"
+
+There is a complete description of the PCRE2 native API in the
+pcre2api
+page and a description of the POSIX API in the
+pcre2posix
+page.
++Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_set_compile_recursion_guard.html b/doc/html/pcre2_set_compile_recursion_guard.html new file mode 100644 index 0000000..c09942c --- /dev/null +++ b/doc/html/pcre2_set_compile_recursion_guard.html @@ -0,0 +1,46 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext, + int (*guard_function)(uint32_t, void *), void *user_data); +
++This function defines, within a compile context, a function that is called +whenever pcre2_compile() starts to compile a parenthesized part of a +pattern. The first argument to the function gives the current depth of +parenthesis nesting, and the second is user data that is supplied when the +function is set up. The callout function should return zero if all is well, or +non-zero to force an error. This feature is provided so that applications can +check the available system stack space, in order to avoid running out. The +result of pcre2_set_compile_recursion_guard() is always zero. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_set_depth_limit.html b/doc/html/pcre2_set_depth_limit.html new file mode 100644 index 0000000..a1cf706 --- /dev/null +++ b/doc/html/pcre2_set_depth_limit.html @@ -0,0 +1,40 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_set_depth_limit(pcre2_match_context *mcontext, + uint32_t value); +
++This function sets the backtracking depth limit field in a match context. The +result is always zero. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_set_glob_escape.html b/doc/html/pcre2_set_glob_escape.html new file mode 100644 index 0000000..2b55627 --- /dev/null +++ b/doc/html/pcre2_set_glob_escape.html @@ -0,0 +1,43 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_set_glob_escape(pcre2_convert_context *cvcontext, + uint32_t escape_char); +
++This function is part of an experimental set of pattern conversion functions. +It sets the escape character that is used when converting globs. The second +argument must either be zero (meaning there is no escape character) or a +punctuation character whose code point is less than 256. The default is grave +accent if running under Windows, otherwise backslash. The result of the +function is zero for success or PCRE2_ERROR_BADDATA if the second argument is +invalid. +
++The pattern conversion functions are described in the +pcre2convert +documentation. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_set_glob_separator.html b/doc/html/pcre2_set_glob_separator.html new file mode 100644 index 0000000..283648e --- /dev/null +++ b/doc/html/pcre2_set_glob_separator.html @@ -0,0 +1,42 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_set_glob_separator(pcre2_convert_context *cvcontext, + uint32_t separator_char); +
++This function is part of an experimental set of pattern conversion functions. +It sets the component separator character that is used when converting globs. +The second argument must be one of the characters forward slash, backslash, or +dot. The default is backslash when running under Windows, otherwise forward +slash. The result of the function is zero for success or PCRE2_ERROR_BADDATA if +the second argument is invalid. +
++The pattern conversion functions are described in the +pcre2convert +documentation. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_set_heap_limit.html b/doc/html/pcre2_set_heap_limit.html new file mode 100644 index 0000000..3631ef6 --- /dev/null +++ b/doc/html/pcre2_set_heap_limit.html @@ -0,0 +1,40 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_set_heap_limit(pcre2_match_context *mcontext, + uint32_t value); +
++This function sets the backtracking heap limit field in a match context. The +result is always zero. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_set_match_limit.html b/doc/html/pcre2_set_match_limit.html new file mode 100644 index 0000000..e840c74 --- /dev/null +++ b/doc/html/pcre2_set_match_limit.html @@ -0,0 +1,40 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_set_match_limit(pcre2_match_context *mcontext, + uint32_t value); +
++This function sets the match limit field in a match context. The result is +always zero. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_set_max_pattern_compiled_length.html b/doc/html/pcre2_set_max_pattern_compiled_length.html new file mode 100644 index 0000000..ab570cf --- /dev/null +++ b/doc/html/pcre2_set_max_pattern_compiled_length.html @@ -0,0 +1,44 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_set_max_pattern_compiled_length( + pcre2_compile_context *ccontext, PCRE2_SIZE value); +
++This function sets, in a compile context, the maximum size (in bytes) for the +memory needed to hold the compiled version of a pattern that is compiled with +this context. The result is always zero. If a pattern that is passed to +pcre2_compile() with this context needs more memory, an error is +generated. The default is the largest number that a PCRE2_SIZE variable can +hold, which is effectively unlimited. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_set_max_pattern_length.html b/doc/html/pcre2_set_max_pattern_length.html new file mode 100644 index 0000000..f6e422a --- /dev/null +++ b/doc/html/pcre2_set_max_pattern_length.html @@ -0,0 +1,43 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_set_max_pattern_length(pcre2_compile_context *ccontext, + PCRE2_SIZE value); +
++This function sets, in a compile context, the maximum text length (in code +units) of the pattern that can be compiled. The result is always zero. If a +longer pattern is passed to pcre2_compile() there is an immediate error +return. The default is effectively unlimited, being the largest value a +PCRE2_SIZE variable can hold. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_set_max_varlookbehind.html b/doc/html/pcre2_set_max_varlookbehind.html new file mode 100644 index 0000000..1c03def --- /dev/null +++ b/doc/html/pcre2_set_max_varlookbehind.html @@ -0,0 +1,42 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_set_max_varlookbehind(pcre2_compile_context *ccontext, + uint32_t value); +
++This sets a maximum length for the number of characters matched by a +variable-length lookbehind assertion. The default is set when PCRE2 is built, +with the ultimate default being 255, the same as Perl. Lookbehind assertions +without a bounding length are not supported. The result is always zero. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_set_newline.html b/doc/html/pcre2_set_newline.html new file mode 100644 index 0000000..ba81300 --- /dev/null +++ b/doc/html/pcre2_set_newline.html @@ -0,0 +1,51 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_set_newline(pcre2_compile_context *ccontext, + uint32_t value); +
++This function sets the newline convention within a compile context. This +specifies which character(s) are recognized as newlines when compiling and +matching patterns. The second argument must be one of: +
+ PCRE2_NEWLINE_CR Carriage return only + PCRE2_NEWLINE_LF Linefeed only + PCRE2_NEWLINE_CRLF CR followed by LF only + PCRE2_NEWLINE_ANYCRLF Any of the above + PCRE2_NEWLINE_ANY Any Unicode newline sequence + PCRE2_NEWLINE_NUL The NUL character (binary zero) ++The result is zero for success or PCRE2_ERROR_BADDATA if the second argument is +invalid. + +
+There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_set_offset_limit.html b/doc/html/pcre2_set_offset_limit.html new file mode 100644 index 0000000..6d9a85c --- /dev/null +++ b/doc/html/pcre2_set_offset_limit.html @@ -0,0 +1,40 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_set_offset_limit(pcre2_match_context *mcontext, + PCRE2_SIZE value); +
++This function sets the offset limit field in a match context. The result is +always zero. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_set_parens_nest_limit.html b/doc/html/pcre2_set_parens_nest_limit.html new file mode 100644 index 0000000..95fd31c --- /dev/null +++ b/doc/html/pcre2_set_parens_nest_limit.html @@ -0,0 +1,40 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_set_parens_nest_limit(pcre2_compile_context *ccontext, + uint32_t value); +
++This function sets, in a compile context, the maximum depth of nested +parentheses in a pattern. The result is always zero. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_set_recursion_limit.html b/doc/html/pcre2_set_recursion_limit.html new file mode 100644 index 0000000..9ff68c2 --- /dev/null +++ b/doc/html/pcre2_set_recursion_limit.html @@ -0,0 +1,40 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_set_recursion_limit(pcre2_match_context *mcontext, + uint32_t value); +
++This function is obsolete and should not be used in new code. Use +pcre2_set_depth_limit() instead. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_set_recursion_memory_management.html b/doc/html/pcre2_set_recursion_memory_management.html new file mode 100644 index 0000000..37af73c --- /dev/null +++ b/doc/html/pcre2_set_recursion_memory_management.html @@ -0,0 +1,42 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_set_recursion_memory_management( + pcre2_match_context *mcontext, + void *(*private_malloc)(size_t, void *), + void (*private_free)(void *, void *), void *memory_data); +
++From release 10.30 onwards, this function is obsolete and does nothing. The +result is always zero. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_set_substitute_callout.html b/doc/html/pcre2_set_substitute_callout.html new file mode 100644 index 0000000..7ae3a39 --- /dev/null +++ b/doc/html/pcre2_set_substitute_callout.html @@ -0,0 +1,43 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_set_substitute_callout(pcre2_match_context *mcontext, + int (*callout_function)(pcre2_substitute_callout_block *), + void *callout_data); +
++This function sets the substitute callout fields in a match context (the first +argument). The second argument specifies a callout function, and the third +argument is an opaque data item that is passed to it. The result of this +function is always zero. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_substitute.html b/doc/html/pcre2_substitute.html new file mode 100644 index 0000000..abf0a70 --- /dev/null +++ b/doc/html/pcre2_substitute.html @@ -0,0 +1,111 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, + PCRE2_SIZE length, PCRE2_SIZE startoffset, + uint32_t options, pcre2_match_data *match_data, + pcre2_match_context *mcontext, PCRE2_SPTR replacement, + PCRE2_SIZE rlength, PCRE2_UCHAR *outputbuffer, + PCRE2_SIZE *outlengthptr); +
++This function matches a compiled regular expression against a given subject +string, using a matching algorithm that is similar to Perl's. It then makes a +copy of the subject, substituting a replacement string for what was matched. +Its arguments are: +
+ code Points to the compiled pattern + subject Points to the subject string + length Length of the subject string + startoffset Offset in the subject at which to start matching + options Option bits + match_data Points to a match data block, or is NULL + mcontext Points to a match context, or is NULL + replacement Points to the replacement string + rlength Length of the replacement string + outputbuffer Points to the output buffer + outlengthptr Points to the length of the output buffer ++A match data block is needed only if you want to inspect the data from the +final match that is returned in that block or if PCRE2_SUBSTITUTE_MATCHED is +set. A match context is needed only if you want to: +
+ Set up a callout function + Set a matching offset limit + Change the backtracking match limit + Change the backtracking depth limit + Set custom memory management in the match context ++The length, startoffset and rlength values are code units, +not characters, as are the contents of the variable pointed at by +outlengthptr. This variable must contain the length of the output buffer +when the function is called. If the function is successful, the value is +changed to the length of the new string, excluding the trailing zero that is +automatically added. +
+The subject and replacement lengths can be given as PCRE2_ZERO_TERMINATED for +zero-terminated strings. The options are: +
+ PCRE2_ANCHORED Match only at the first position + PCRE2_ENDANCHORED Match only at end of subject + PCRE2_NOTBOL Subject is not the beginning of a line + PCRE2_NOTEOL Subject is not the end of a line + PCRE2_NOTEMPTY An empty string is not a valid match + PCRE2_NOTEMPTY_ATSTART An empty string at the start of the subject is not a valid match + PCRE2_NO_JIT Do not use JIT matching + PCRE2_NO_UTF_CHECK Do not check for UTF validity in the subject or replacement + (only relevant if PCRE2_UTF was set at compile time) + PCRE2_SUBSTITUTE_EXTENDED Do extended replacement processing + PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the subject + PCRE2_SUBSTITUTE_LITERAL The replacement string is literal + PCRE2_SUBSTITUTE_MATCHED Use pre-existing match data for first match + PCRE2_SUBSTITUTE_OVERFLOW_LENGTH If overflow, compute needed length + PCRE2_SUBSTITUTE_REPLACEMENT_ONLY Return only replacement string(s) + PCRE2_SUBSTITUTE_UNKNOWN_UNSET Treat unknown group as unset + PCRE2_SUBSTITUTE_UNSET_EMPTY Simple unset insert = empty string ++If PCRE2_SUBSTITUTE_LITERAL is set, PCRE2_SUBSTITUTE_EXTENDED, +PCRE2_SUBSTITUTE_UNKNOWN_UNSET, and PCRE2_SUBSTITUTE_UNSET_EMPTY are ignored. + +
+If PCRE2_SUBSTITUTE_MATCHED is set, match_data must be non-NULL; its +contents must be the result of a call to pcre2_match() using the same +pattern and subject. +
++The function returns the number of substitutions, which may be zero if there +are no matches. The result may be greater than one only when +PCRE2_SUBSTITUTE_GLOBAL is set. In the event of an error, a negative error code +is returned. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_substring_copy_byname.html b/doc/html/pcre2_substring_copy_byname.html new file mode 100644 index 0000000..fd01805 --- /dev/null +++ b/doc/html/pcre2_substring_copy_byname.html @@ -0,0 +1,58 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_substring_copy_byname(pcre2_match_data *match_data, + PCRE2_SPTR name, PCRE2_UCHAR *buffer, PCRE2_SIZE *bufflen); +
++This is a convenience function for extracting a captured substring, identified +by name, into a given buffer. The arguments are: +
+ match_data The match data block for the match + name Name of the required substring + buffer Buffer to receive the string + bufflen Length of buffer (code units) ++The bufflen variable is updated to contain the length of the extracted +string, excluding the trailing zero. The yield of the function is zero for +success or one of the following error numbers: +
+ PCRE2_ERROR_NOSUBSTRING there are no groups of that name + PCRE2_ERROR_UNAVAILABLE the ovector was too small for that group + PCRE2_ERROR_UNSET the group did not participate in the match + PCRE2_ERROR_NOMEMORY the buffer is not big enough ++If there is more than one group with the given name, the first one that is set +is returned. In this situation PCRE2_ERROR_UNSET means that no group with the +given name was set. +
+There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_substring_copy_bynumber.html b/doc/html/pcre2_substring_copy_bynumber.html new file mode 100644 index 0000000..83e1a27 --- /dev/null +++ b/doc/html/pcre2_substring_copy_bynumber.html @@ -0,0 +1,57 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_substring_copy_bynumber(pcre2_match_data *match_data, + uint32_t number, PCRE2_UCHAR *buffer, + PCRE2_SIZE *bufflen); +
++This is a convenience function for extracting a captured substring into a given +buffer. The arguments are: +
+ match_data The match data block for the match + number Number of the required substring + buffer Buffer to receive the string + bufflen Length of buffer ++The bufflen variable is updated with the length of the extracted string, +excluding the terminating zero. The yield of the function is zero for success +or one of the following error numbers: +
+ PCRE2_ERROR_NOSUBSTRING there are no groups of that number + PCRE2_ERROR_UNAVAILABLE the ovector was too small for that group + PCRE2_ERROR_UNSET the group did not participate in the match + PCRE2_ERROR_NOMEMORY the buffer is too small + ++ +
+There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_substring_free.html b/doc/html/pcre2_substring_free.html new file mode 100644 index 0000000..e0d0fbd --- /dev/null +++ b/doc/html/pcre2_substring_free.html @@ -0,0 +1,41 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++void pcre2_substring_free(PCRE2_UCHAR *buffer); +
++This is a convenience function for freeing the memory obtained by a previous +call to pcre2_substring_get_byname() or +pcre2_substring_get_bynumber(). Its only argument is a pointer to the +string. If the argument is NULL, the function does nothing. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_substring_get_byname.html b/doc/html/pcre2_substring_get_byname.html new file mode 100644 index 0000000..a4b8771 --- /dev/null +++ b/doc/html/pcre2_substring_get_byname.html @@ -0,0 +1,60 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_substring_get_byname(pcre2_match_data *match_data, + PCRE2_SPTR name, PCRE2_UCHAR **bufferptr, PCRE2_SIZE *bufflen); +
++This is a convenience function for extracting a captured substring by name into +newly acquired memory. The arguments are: +
+ match_data The match data for the match + name Name of the required substring + bufferptr Where to put the string pointer + bufflen Where to put the string length ++The memory in which the substring is placed is obtained by calling the same +memory allocation function that was used for the match data block. The +convenience function pcre2_substring_free() can be used to free it when +it is no longer needed. The yield of the function is zero for success or one of +the following error numbers: +
+ PCRE2_ERROR_NOSUBSTRING there are no groups of that name + PCRE2_ERROR_UNAVAILABLE the ovector was too small for that group + PCRE2_ERROR_UNSET the group did not participate in the match + PCRE2_ERROR_NOMEMORY memory could not be obtained ++If there is more than one group with the given name, the first one that is set +is returned. In this situation PCRE2_ERROR_UNSET means that no group with the +given name was set. +
+There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_substring_get_bynumber.html b/doc/html/pcre2_substring_get_bynumber.html new file mode 100644 index 0000000..391bc82 --- /dev/null +++ b/doc/html/pcre2_substring_get_bynumber.html @@ -0,0 +1,58 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_substring_get_bynumber(pcre2_match_data *match_data, + uint32_t number, PCRE2_UCHAR **bufferptr, PCRE2_SIZE *bufflen); +
++This is a convenience function for extracting a captured substring by number +into newly acquired memory. The arguments are: +
+ match_data The match data for the match + number Number of the required substring + bufferptr Where to put the string pointer + bufflen Where to put the string length ++The memory in which the substring is placed is obtained by calling the same +memory allocation function that was used for the match data block. The +convenience function pcre2_substring_free() can be used to free it when +it is no longer needed. The yield of the function is zero for success or one of +the following error numbers: +
+ PCRE2_ERROR_NOSUBSTRING there are no groups of that number + PCRE2_ERROR_UNAVAILABLE the ovector was too small for that group + PCRE2_ERROR_UNSET the group did not participate in the match + PCRE2_ERROR_NOMEMORY memory could not be obtained + ++ +
+There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_substring_length_byname.html b/doc/html/pcre2_substring_length_byname.html new file mode 100644 index 0000000..213bc94 --- /dev/null +++ b/doc/html/pcre2_substring_length_byname.html @@ -0,0 +1,46 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_substring_length_byname(pcre2_match_data *match_data, + PCRE2_SPTR name, PCRE2_SIZE *length); +
++This function returns the length of a matched substring, identified by name. +The arguments are: +
+ match_data The match data block for the match + name The substring name + length Where to return the length ++The yield is zero on success, or an error code if the substring is not found. + +
+There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_substring_length_bynumber.html b/doc/html/pcre2_substring_length_bynumber.html new file mode 100644 index 0000000..db01cca --- /dev/null +++ b/doc/html/pcre2_substring_length_bynumber.html @@ -0,0 +1,48 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_substring_length_bynumber(pcre2_match_data *match_data, + uint32_t number, PCRE2_SIZE *length); +
++This function returns the length of a matched substring, identified by number. +The arguments are: +
+ match_data The match data block for the match + number The substring number + length Where to return the length, or NULL ++The third argument may be NULL if all you want to know is whether or not a +substring is set. The yield is zero on success, or a negative error code +otherwise. After a partial match, only substring 0 is available. + +
+There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_substring_list_free.html b/doc/html/pcre2_substring_list_free.html new file mode 100644 index 0000000..dea8bc5 --- /dev/null +++ b/doc/html/pcre2_substring_list_free.html @@ -0,0 +1,41 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++void pcre2_substring_list_free(PCRE2_UCHAR **list); +
++This is a convenience function for freeing the store obtained by a previous +call to pcre2_substring_list_get(). Its only argument is a pointer to +the list of string pointers. If the argument is NULL, the function returns +immediately, without doing anything. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_substring_list_get.html b/doc/html/pcre2_substring_list_get.html new file mode 100644 index 0000000..fd43627 --- /dev/null +++ b/doc/html/pcre2_substring_list_get.html @@ -0,0 +1,56 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_substring_list_get(pcre2_match_data *match_data, + PCRE2_UCHAR ***listptr, PCRE2_SIZE **lengthsptr); +
++This is a convenience function for extracting all the captured substrings after +a pattern match. It builds a list of pointers to the strings, and (optionally) +a second list that contains their lengths (in code units), excluding a +terminating zero that is added to each of them. All this is done in a single +block of memory that is obtained using the same memory allocation function that +was used to get the match data block. The convenience function +pcre2_substring_list_free() can be used to free it when it is no longer +needed. The arguments are: +
+ match_data The match data block + listptr Where to put a pointer to the list + lengthsptr Where to put a pointer to the lengths, or NULL ++A pointer to a list of pointers is put in the variable whose address is in +listptr. The list is terminated by a NULL pointer. If lengthsptr is +not NULL, a matching list of lengths is created, and its address is placed in +lengthsptr. The yield of the function is zero on success or +PCRE2_ERROR_NOMEMORY if sufficient memory could not be obtained. + +
+There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_substring_nametable_scan.html b/doc/html/pcre2_substring_nametable_scan.html new file mode 100644 index 0000000..277affa --- /dev/null +++ b/doc/html/pcre2_substring_nametable_scan.html @@ -0,0 +1,53 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_substring_nametable_scan(const pcre2_code *code, + PCRE2_SPTR name, PCRE2_SPTR *first, PCRE2_SPTR *last); +
++This convenience function finds, for a compiled pattern, the first and last +entries for a given name in the table that translates capture group names into +numbers. +
+ code Compiled regular expression + name Name whose entries required + first Where to return a pointer to the first entry + last Where to return a pointer to the last entry ++When the name is found in the table, if first is NULL, the function +returns a group number, but if there is more than one matching entry, it is not +defined which one. Otherwise, when both pointers have been set, the yield of +the function is the length of each entry in code units. If the name is not +found, PCRE2_ERROR_NOSUBSTRING is returned. + +
+There is a complete description of the PCRE2 native API, including the format of +the table entries, in the +pcre2api +page, and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_substring_number_from_name.html b/doc/html/pcre2_substring_number_from_name.html new file mode 100644 index 0000000..160fbda --- /dev/null +++ b/doc/html/pcre2_substring_number_from_name.html @@ -0,0 +1,50 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_substring_number_from_name(const pcre2_code *code, + PCRE2_SPTR name); +
++This convenience function finds the number of a named substring capturing +parenthesis in a compiled pattern, provided that it is a unique name. The +function arguments are: +
+ code Compiled regular expression + name Name whose number is required ++The yield of the function is the number of the parenthesis if the name is +found, or PCRE2_ERROR_NOSUBSTRING if it is not found. When duplicate names are +allowed (PCRE2_DUPNAMES is set), if the name is not unique, +PCRE2_ERROR_NOUNIQUESUBSTRING is returned. You can obtain the list of numbers +with the same name by calling pcre2_substring_nametable_scan(). + +
+There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2api.html b/doc/html/pcre2api.html new file mode 100644 index 0000000..6b60ee9 --- /dev/null +++ b/doc/html/pcre2api.html @@ -0,0 +1,4186 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+#include <pcre2.h>
+
+
+PCRE2 is a new API for PCRE, starting at release 10.0. This document contains a
+description of all its native functions. See the
+pcre2
+document for an overview of all the PCRE2 documentation.
+
+pcre2_code *pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE length,
+ uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset,
+ pcre2_compile_context *ccontext);
+
+
+void pcre2_code_free(pcre2_code *code);
+
+
+pcre2_match_data *pcre2_match_data_create(uint32_t ovecsize,
+ pcre2_general_context *gcontext);
+
+
+pcre2_match_data *pcre2_match_data_create_from_pattern(
+ const pcre2_code *code, pcre2_general_context *gcontext);
+
+
+int pcre2_match(const pcre2_code *code, PCRE2_SPTR subject,
+ PCRE2_SIZE length, PCRE2_SIZE startoffset,
+ uint32_t options, pcre2_match_data *match_data,
+ pcre2_match_context *mcontext);
+
+
+int pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject,
+ PCRE2_SIZE length, PCRE2_SIZE startoffset,
+ uint32_t options, pcre2_match_data *match_data,
+ pcre2_match_context *mcontext,
+ int *workspace, PCRE2_SIZE wscount);
+
+
+void pcre2_match_data_free(pcre2_match_data *match_data);
+
+PCRE2_SPTR pcre2_get_mark(pcre2_match_data *match_data);
+
+
+PCRE2_SIZE pcre2_get_match_data_size(pcre2_match_data *match_data);
+
+
+PCRE2_SIZE pcre2_get_match_data_heapframes_size(
+ pcre2_match_data *match_data);
+
+
+uint32_t pcre2_get_ovector_count(pcre2_match_data *match_data);
+
+
+PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *match_data);
+
+
+PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *match_data);
+
+pcre2_general_context *pcre2_general_context_create(
+ void *(*private_malloc)(PCRE2_SIZE, void *),
+ void (*private_free)(void *, void *), void *memory_data);
+
+
+pcre2_general_context *pcre2_general_context_copy(
+ pcre2_general_context *gcontext);
+
+
+void pcre2_general_context_free(pcre2_general_context *gcontext);
+
+pcre2_compile_context *pcre2_compile_context_create(
+ pcre2_general_context *gcontext);
+
+
+pcre2_compile_context *pcre2_compile_context_copy(
+ pcre2_compile_context *ccontext);
+
+
+void pcre2_compile_context_free(pcre2_compile_context *ccontext);
+
+
+int pcre2_set_bsr(pcre2_compile_context *ccontext,
+ uint32_t value);
+
+
+int pcre2_set_character_tables(pcre2_compile_context *ccontext,
+ const uint8_t *tables);
+
+
+int pcre2_set_compile_extra_options(pcre2_compile_context *ccontext,
+ uint32_t extra_options);
+
+
+int pcre2_set_max_pattern_length(pcre2_compile_context *ccontext,
+ PCRE2_SIZE value);
+
+
+int pcre2_set_max_pattern_compiled_length(
+ pcre2_compile_context *ccontext, PCRE2_SIZE value);
+
+
+int pcre2_set_max_varlookbehind(pcre2_compile_context *ccontext,
+ uint32_t value);
+
+
+int pcre2_set_newline(pcre2_compile_context *ccontext,
+ uint32_t value);
+
+
+int pcre2_set_parens_nest_limit(pcre2_compile_context *ccontext,
+ uint32_t value);
+
+
+int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext,
+ int (*guard_function)(uint32_t, void *), void *user_data);
+
+pcre2_match_context *pcre2_match_context_create(
+ pcre2_general_context *gcontext);
+
+
+pcre2_match_context *pcre2_match_context_copy(
+ pcre2_match_context *mcontext);
+
+
+void pcre2_match_context_free(pcre2_match_context *mcontext);
+
+
+int pcre2_set_callout(pcre2_match_context *mcontext,
+ int (*callout_function)(pcre2_callout_block *, void *),
+ void *callout_data);
+
+
+int pcre2_set_substitute_callout(pcre2_match_context *mcontext,
+ int (*callout_function)(pcre2_substitute_callout_block *, void *),
+ void *callout_data);
+
+
+int pcre2_set_offset_limit(pcre2_match_context *mcontext,
+ PCRE2_SIZE value);
+
+
+int pcre2_set_heap_limit(pcre2_match_context *mcontext,
+ uint32_t value);
+
+
+int pcre2_set_match_limit(pcre2_match_context *mcontext,
+ uint32_t value);
+
+
+int pcre2_set_depth_limit(pcre2_match_context *mcontext,
+ uint32_t value);
+
+int pcre2_substring_copy_byname(pcre2_match_data *match_data,
+ PCRE2_SPTR name, PCRE2_UCHAR *buffer, PCRE2_SIZE *bufflen);
+
+
+int pcre2_substring_copy_bynumber(pcre2_match_data *match_data,
+ uint32_t number, PCRE2_UCHAR *buffer,
+ PCRE2_SIZE *bufflen);
+
+
+void pcre2_substring_free(PCRE2_UCHAR *buffer);
+
+
+int pcre2_substring_get_byname(pcre2_match_data *match_data,
+ PCRE2_SPTR name, PCRE2_UCHAR **bufferptr, PCRE2_SIZE *bufflen);
+
+
+int pcre2_substring_get_bynumber(pcre2_match_data *match_data,
+ uint32_t number, PCRE2_UCHAR **bufferptr,
+ PCRE2_SIZE *bufflen);
+
+
+int pcre2_substring_length_byname(pcre2_match_data *match_data,
+ PCRE2_SPTR name, PCRE2_SIZE *length);
+
+
+int pcre2_substring_length_bynumber(pcre2_match_data *match_data,
+ uint32_t number, PCRE2_SIZE *length);
+
+
+int pcre2_substring_nametable_scan(const pcre2_code *code,
+ PCRE2_SPTR name, PCRE2_SPTR *first, PCRE2_SPTR *last);
+
+
+int pcre2_substring_number_from_name(const pcre2_code *code,
+ PCRE2_SPTR name);
+
+
+void pcre2_substring_list_free(PCRE2_UCHAR **list);
+
+
+int pcre2_substring_list_get(pcre2_match_data *match_data,
+ PCRE2_UCHAR ***listptr, PCRE2_SIZE **lengthsptr);
+
+int pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, + PCRE2_SIZE length, PCRE2_SIZE startoffset, + uint32_t options, pcre2_match_data *match_data, + pcre2_match_context *mcontext, PCRE2_SPTR replacementz, + PCRE2_SIZE rlength, PCRE2_UCHAR *outputbuffer, + PCRE2_SIZE *outlengthptr); +
+
+int pcre2_jit_compile(pcre2_code *code, uint32_t options);
+
+
+int pcre2_jit_match(const pcre2_code *code, PCRE2_SPTR subject,
+ PCRE2_SIZE length, PCRE2_SIZE startoffset,
+ uint32_t options, pcre2_match_data *match_data,
+ pcre2_match_context *mcontext);
+
+
+void pcre2_jit_free_unused_memory(pcre2_general_context *gcontext);
+
+
+pcre2_jit_stack *pcre2_jit_stack_create(size_t startsize,
+ size_t maxsize, pcre2_general_context *gcontext);
+
+
+void pcre2_jit_stack_assign(pcre2_match_context *mcontext,
+ pcre2_jit_callback callback_function, void *callback_data);
+
+
+void pcre2_jit_stack_free(pcre2_jit_stack *jit_stack);
+
+int32_t pcre2_serialize_decode(pcre2_code **codes,
+ int32_t number_of_codes, const uint8_t *bytes,
+ pcre2_general_context *gcontext);
+
+
+int32_t pcre2_serialize_encode(const pcre2_code **codes,
+ int32_t number_of_codes, uint8_t **serialized_bytes,
+ PCRE2_SIZE *serialized_size, pcre2_general_context *gcontext);
+
+
+void pcre2_serialize_free(uint8_t *bytes);
+
+
+int32_t pcre2_serialize_get_number_of_codes(const uint8_t *bytes);
+
+pcre2_code *pcre2_code_copy(const pcre2_code *code);
+
+
+pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *code);
+
+
+int pcre2_get_error_message(int errorcode, PCRE2_UCHAR *buffer,
+ PCRE2_SIZE bufflen);
+
+
+const uint8_t *pcre2_maketables(pcre2_general_context *gcontext);
+
+
+void pcre2_maketables_free(pcre2_general_context *gcontext,
+ const uint8_t *tables);
+
+
+int pcre2_pattern_info(const pcre2_code *code, uint32_t what,
+ void *where);
+
+
+int pcre2_callout_enumerate(const pcre2_code *code,
+ int (*callback)(pcre2_callout_enumerate_block *, void *),
+ void *user_data);
+
+
+int pcre2_config(uint32_t what, void *where);
+
+int pcre2_set_recursion_limit(pcre2_match_context *mcontext,
+ uint32_t value);
+
+
+int pcre2_set_recursion_memory_management(
+ pcre2_match_context *mcontext,
+ void *(*private_malloc)(size_t, void *),
+ void (*private_free)(void *, void *), void *memory_data);
+
+
+These functions became obsolete at release 10.30 and are retained only for
+backward compatibility. They should not be used in new code. The first is
+replaced by pcre2_set_depth_limit(); the second is no longer needed and
+has no effect (it always returns zero).
+
+pcre2_convert_context *pcre2_convert_context_create(
+ pcre2_general_context *gcontext);
+
+
+pcre2_convert_context *pcre2_convert_context_copy(
+ pcre2_convert_context *cvcontext);
+
+
+void pcre2_convert_context_free(pcre2_convert_context *cvcontext);
+
+
+int pcre2_set_glob_escape(pcre2_convert_context *cvcontext,
+ uint32_t escape_char);
+
+
+int pcre2_set_glob_separator(pcre2_convert_context *cvcontext,
+ uint32_t separator_char);
+
+
+int pcre2_pattern_convert(PCRE2_SPTR pattern, PCRE2_SIZE length,
+ uint32_t options, PCRE2_UCHAR **buffer,
+ PCRE2_SIZE *blength, pcre2_convert_context *cvcontext);
+
+
+void pcre2_converted_pattern_free(PCRE2_UCHAR *converted_pattern);
+
+
+These functions provide a way of converting non-PCRE2 patterns into
+patterns that can be processed by pcre2_compile(). This facility is
+experimental and may be changed in future releases. At present, "globs" and
+POSIX basic and extended patterns can be converted. Details are given in the
+pcre2convert
+documentation.
+
+There are three PCRE2 libraries, supporting 8-bit, 16-bit, and 32-bit code +units, respectively. However, there is just one header file, pcre2.h. +This contains the function prototypes and other definitions for all three +libraries. One, two, or all three can be installed simultaneously. On Unix-like +systems the libraries are called libpcre2-8, libpcre2-16, and +libpcre2-32, and they can also co-exist with the original PCRE libraries. +Every PCRE2 function comes in three different forms, one for each library, for +example: +
+ pcre2_compile_8() + pcre2_compile_16() + pcre2_compile_32() ++There are also three different sets of data types: +
+ PCRE2_UCHAR8, PCRE2_UCHAR16, PCRE2_UCHAR32 + PCRE2_SPTR8, PCRE2_SPTR16, PCRE2_SPTR32 ++The UCHAR types define unsigned code units of the appropriate widths. +For example, PCRE2_UCHAR16 is usually defined as `uint16_t'. +The SPTR types are pointers to constants of the equivalent UCHAR types, +that is, they are pointers to vectors of unsigned code units. + +
+Character strings are passed to a PCRE2 library as sequences of unsigned +integers in code units of the appropriate width. The length of a string may +be given as a number of code units, or the string may be specified as +zero-terminated. +
++Many applications use only one code unit width. For their convenience, macros +are defined whose names are the generic forms such as pcre2_compile() and +PCRE2_SPTR. These macros use the value of the macro PCRE2_CODE_UNIT_WIDTH to +generate the appropriate width-specific function and macro names. +PCRE2_CODE_UNIT_WIDTH is not defined by default. An application must define it +to be 8, 16, or 32 before including pcre2.h in order to make use of the +generic names. +
++Applications that use more than one code unit width can be linked with more +than one PCRE2 library, but must define PCRE2_CODE_UNIT_WIDTH to be 0 before +including pcre2.h, and then use the real function names. Any code that is +to be included in an environment where the value of PCRE2_CODE_UNIT_WIDTH is +unknown should also use the real function names. (Unfortunately, it is not +possible in C code to save and restore the value of a macro.) +
++If PCRE2_CODE_UNIT_WIDTH is not defined before including pcre2.h, a +compiler error occurs. +
++When using multiple libraries in an application, you must take care when +processing any particular pattern to use only functions from a single library. +For example, if you want to run a match using a pattern that was compiled with +pcre2_compile_16(), you must do so with pcre2_match_16(), not +pcre2_match_8() or pcre2_match_32(). +
++In the function summaries above, and in the rest of this document and other +PCRE2 documents, functions and data types are described using their generic +names, without the _8, _16, or _32 suffix. +
++PCRE2 has its own native API, which is described in this document. There are +also some wrapper functions for the 8-bit library that correspond to the +POSIX regular expression API, but they do not give access to all the +functionality of PCRE2 and they are not thread-safe. They are described in the +pcre2posix +documentation. Both these APIs define a set of C function calls. +
++The native API C data types, function prototypes, option values, and error +codes are defined in the header file pcre2.h, which also contains +definitions of PCRE2_MAJOR and PCRE2_MINOR, the major and minor release numbers +for the library. Applications can use these to include support for different +releases of PCRE2. +
++In a Windows environment, if you want to statically link an application program +against a non-dll PCRE2 library, you must define PCRE2_STATIC before including +pcre2.h. +
++The functions pcre2_compile() and pcre2_match() are used for +compiling and matching regular expressions in a Perl-compatible manner. A +sample program that demonstrates the simplest way of using them is provided in +the file called pcre2demo.c in the PCRE2 source distribution. A listing +of this program is given in the +pcre2demo +documentation, and the +pcre2sample +documentation describes how to compile and run it. +
++The compiling and matching functions recognize various options that are passed +as bits in an options argument. There are also some more complicated parameters +such as custom memory management functions and resource limits that are passed +in "contexts" (which are just memory blocks, described below). Simple +applications do not need to make use of contexts. +
++Just-in-time (JIT) compiler support is an optional feature of PCRE2 that can be +built in appropriate hardware environments. It greatly speeds up the matching +performance of many patterns. Programs can request that it be used if +available by calling pcre2_jit_compile() after a pattern has been +successfully compiled by pcre2_compile(). This does nothing if JIT +support is not available. +
++More complicated programs might need to make use of the specialist functions +pcre2_jit_stack_create(), pcre2_jit_stack_free(), and +pcre2_jit_stack_assign() in order to control the JIT code's memory usage. +
++JIT matching is automatically used by pcre2_match() if it is available, +unless the PCRE2_NO_JIT option is set. There is also a direct interface for JIT +matching, which gives improved performance at the expense of less sanity +checking. The JIT-specific functions are discussed in the +pcre2jit +documentation. +
++A second matching function, pcre2_dfa_match(), which is not +Perl-compatible, is also provided. This uses a different algorithm for the +matching. The alternative algorithm finds all possible matches (at a given +point in the subject), and scans the subject just once (unless there are +lookaround assertions). However, this algorithm does not return captured +substrings. A description of the two matching algorithms and their advantages +and disadvantages is given in the +pcre2matching +documentation. There is no JIT support for pcre2_dfa_match(). +
++In addition to the main compiling and matching functions, there are convenience +functions for extracting captured substrings from a subject string that has +been matched by pcre2_match(). They are: +
+ pcre2_substring_copy_byname() + pcre2_substring_copy_bynumber() + pcre2_substring_get_byname() + pcre2_substring_get_bynumber() + pcre2_substring_list_get() + pcre2_substring_length_byname() + pcre2_substring_length_bynumber() + pcre2_substring_nametable_scan() + pcre2_substring_number_from_name() ++pcre2_substring_free() and pcre2_substring_list_free() are also +provided, to free memory used for extracted strings. If either of these +functions is called with a NULL argument, the function returns immediately +without doing anything. + +
+The function pcre2_substitute() can be called to match a pattern and +return a copy of the subject string with substitutions for parts that were +matched. +
++Functions whose names begin with pcre2_serialize_ are used for saving +compiled patterns on disc or elsewhere, and reloading them later. +
++Finally, there are functions for finding out information about a compiled +pattern (pcre2_pattern_info()) and about the configuration with which +PCRE2 was built (pcre2_config()). +
++Functions with names ending with _free() are used for freeing memory +blocks of various sorts. In all cases, if one of these functions is called with +a NULL argument, it does nothing. +
++The PCRE2 API uses string lengths and offsets into strings of code units in +several places. These values are always of type PCRE2_SIZE, which is an +unsigned integer type, currently always defined as size_t. The largest +value that can be stored in such a type (that is ~(PCRE2_SIZE)0) is reserved +as a special indicator for zero-terminated strings and unset offsets. +Therefore, the longest string that can be handled is one less than this +maximum. Note that string lengths are always given in code units. Only in the +8-bit library is such a length the same as the number of bytes in the string. +
++PCRE2 supports five different conventions for indicating line breaks in +strings: a single CR (carriage return) character, a single LF (linefeed) +character, the two-character sequence CRLF, any of the three preceding, or any +Unicode newline sequence. The Unicode newline sequences are the three just +mentioned, plus the single characters VT (vertical tab, U+000B), FF (form feed, +U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS +(paragraph separator, U+2029). +
++Each of the first three conventions is used by at least one operating system as +its standard newline sequence. When PCRE2 is built, a default can be specified. +If it is not, the default is set to LF, which is the Unix standard. However, +the newline convention can be changed by an application when calling +pcre2_compile(), or it can be specified by special text at the start of +the pattern itself; this overrides any other settings. See the +pcre2pattern +page for details of the special character sequences. +
++In the PCRE2 documentation the word "newline" is used to mean "the character or +pair of characters that indicate a line break". The choice of newline +convention affects the handling of the dot, circumflex, and dollar +metacharacters, the handling of #-comments in /x mode, and, when CRLF is a +recognized line ending sequence, the match position advancement for a +non-anchored pattern. There is more detail about this in the +section on pcre2_match() options +below. +
++The choice of newline convention does not affect the interpretation of +the \n or \r escape sequences, nor does it affect what \R matches; this has +its own separate convention. +
++In a multithreaded application it is important to keep thread-specific data +separate from data that can be shared between threads. The PCRE2 library code +itself is thread-safe: it contains no static or global variables. The API is +designed to be fairly simple for non-threaded applications while at the same +time ensuring that multithreaded applications can use it. +
++There are several different blocks of data that are used to pass information +between the application and the PCRE2 libraries. +
++A pointer to the compiled form of a pattern is returned to the user when +pcre2_compile() is successful. The data in the compiled pattern is fixed, +and does not change when the pattern is matched. Therefore, it is thread-safe, +that is, the same compiled pattern can be used by more than one thread +simultaneously. For example, an application can compile all its patterns at the +start, before forking off multiple threads that use them. However, if the +just-in-time (JIT) optimization feature is being used, it needs separate memory +stack areas for each thread. See the +pcre2jit +documentation for more details. +
++In a more complicated situation, where patterns are compiled only when they are +first needed, but are still shared between threads, pointers to compiled +patterns must be protected from simultaneous writing by multiple threads. This +is somewhat tricky to do correctly. If you know that writing to a pointer is +atomic in your environment, you can use logic like this: +
+ Get a read-only (shared) lock (mutex) for pointer
+ if (pointer == NULL)
+ {
+ Get a write (unique) lock for pointer
+ if (pointer == NULL) pointer = pcre2_compile(...
+ }
+ Release the lock
+ Use pointer in pcre2_match()
+
+Of course, testing for compilation errors should also be included in the code.
+
++The reason for checking the pointer a second time is as follows: Several +threads may have acquired the shared lock and tested the pointer for being +NULL, but only one of them will be given the write lock, with the rest kept +waiting. The winning thread will compile the pattern and store the result. +After this thread releases the write lock, another thread will get it, and if +it does not retest pointer for being NULL, will recompile the pattern and +overwrite the pointer, creating a memory leak and possibly causing other +issues. +
++In an environment where writing to a pointer may not be atomic, the above logic +is not sufficient. The thread that is doing the compiling may be descheduled +after writing only part of the pointer, which could cause other threads to use +an invalid value. Instead of checking the pointer itself, a separate "pointer +is valid" flag (that can be updated atomically) must be used: +
+ Get a read-only (shared) lock (mutex) for pointer
+ if (!pointer_is_valid)
+ {
+ Get a write (unique) lock for pointer
+ if (!pointer_is_valid)
+ {
+ pointer = pcre2_compile(...
+ pointer_is_valid = TRUE
+ }
+ }
+ Release the lock
+ Use pointer in pcre2_match()
+
+If JIT is being used, but the JIT compilation is not being done immediately
+(perhaps waiting to see if the pattern is used often enough), similar logic is
+required. JIT compilation updates a value within the compiled code block, so a
+thread must gain unique write access to the pointer before calling
+pcre2_jit_compile(). Alternatively, pcre2_code_copy() or
+pcre2_code_copy_with_tables() can be used to obtain a private copy of the
+compiled code before calling the JIT compiler.
+
++The next main section below introduces the idea of "contexts" in which PCRE2 +functions are called. A context is nothing more than a collection of parameters +that control the way PCRE2 operates. Grouping a number of parameters together +in a context is a convenient way of passing them to a PCRE2 function without +using lots of arguments. The parameters that are stored in contexts are in some +sense "advanced features" of the API. Many straightforward applications will +not need to use contexts. +
++In a multithreaded application, if the parameters in a context are values that +are never changed, the same context can be used by all the threads. However, if +any thread needs to change any value in a context, it must make its own +thread-specific copy. +
++The matching functions need a block of memory for storing the results of a +match. This includes details of what was matched, as well as additional +information such as the name of a (*MARK) setting. Each thread must provide its +own copy of this memory. +
++Some PCRE2 functions have a lot of parameters, many of which are used only by +specialist applications, for example, those that use custom memory management +or non-standard character tables. To keep function argument lists at a +reasonable size, and at the same time to keep the API extensible, "uncommon" +parameters are passed to certain functions in a context instead of +directly. A context is just a block of memory that holds the parameter values. +Applications that do not need to adjust any of the context parameters can pass +NULL when a context pointer is required. +
++There are three different types of context: a general context that is relevant +for several PCRE2 operations, a compile-time context, and a match-time context. +
+
+At present, this context just contains pointers to (and data for) external
+memory management functions that are called from several places in the PCRE2
+library. The context is named `general' rather than specifically `memory'
+because in future other fields may be added. If you do not want to supply your
+own custom memory management functions, you do not need to bother with a
+general context. A general context is created by:
+
+
+pcre2_general_context *pcre2_general_context_create(
+ void *(*private_malloc)(PCRE2_SIZE, void *),
+ void (*private_free)(void *, void *), void *memory_data);
+
+
+The two function pointers specify custom memory management functions, whose
+prototypes are:
+
+ void *private_malloc(PCRE2_SIZE, void *); + void private_free(void *, void *); ++Whenever code in PCRE2 calls these functions, the final argument is the value +of memory_data. Either of the first two arguments of the creation +function may be NULL, in which case the system memory management functions +malloc() and free() are used. (This is not currently useful, as +there are no other fields in a general context, but in future there might be.) +The private_malloc() function is used (if supplied) to obtain memory for +storing the context, and all three values are saved as part of the context. + +
+Whenever PCRE2 creates a data block of any kind, the block contains a pointer +to the free() function that matches the malloc() function that was +used. When the time comes to free the block, this function is called. +
+
+A general context can be copied by calling:
+
+
+pcre2_general_context *pcre2_general_context_copy(
+ pcre2_general_context *gcontext);
+
+
+The memory used for a general context should be freed by calling:
+
+
+void pcre2_general_context_free(pcre2_general_context *gcontext);
+
+
+If this function is passed a NULL argument, it returns immediately without
+doing anything.
+
+A compile context is required if you want to provide an external function for +stack checking during compilation or to change the default values of any of the +following compile-time parameters: +
+ What \R matches (Unicode newlines or CR, LF, CRLF only) + PCRE2's character tables + The newline character sequence + The compile time nested parentheses limit + The maximum length of the pattern string + The extra options bits (none set by default) ++A compile context is also required if you are using custom memory management. +If none of these apply, just pass NULL as the context argument of +pcre2_compile(). + +
+A compile context is created, copied, and freed by the following functions:
+
+
+pcre2_compile_context *pcre2_compile_context_create(
+ pcre2_general_context *gcontext);
+
+
+pcre2_compile_context *pcre2_compile_context_copy(
+ pcre2_compile_context *ccontext);
+
+
+void pcre2_compile_context_free(pcre2_compile_context *ccontext);
+
+
+A compile context is created with default values for its parameters. These can
+be changed by calling the following functions, which return 0 on success, or
+PCRE2_ERROR_BADDATA if invalid data is detected.
+
+
+int pcre2_set_bsr(pcre2_compile_context *ccontext,
+ uint32_t value);
+
+
+The value must be PCRE2_BSR_ANYCRLF, to specify that \R matches only CR, LF,
+or CRLF, or PCRE2_BSR_UNICODE, to specify that \R matches any Unicode line
+ending sequence. The value is used by the JIT compiler and by the two
+interpreted matching functions, pcre2_match() and
+pcre2_dfa_match().
+
+
+int pcre2_set_character_tables(pcre2_compile_context *ccontext,
+ const uint8_t *tables);
+
+
+The value must be the result of a call to pcre2_maketables(), whose only
+argument is a general context. This function builds a set of character tables
+in the current locale.
+
+
+int pcre2_set_compile_extra_options(pcre2_compile_context *ccontext,
+ uint32_t extra_options);
+
+
+As PCRE2 has developed, almost all the 32 option bits that are available in
+the options argument of pcre2_compile() have been used up. To avoid
+running out, the compile context contains a set of extra option bits which are
+used for some newer, assumed rarer, options. This function sets those bits. It
+always sets all the bits (either on or off), replacing rather than merging with any existing
+setting. The available options are defined in the section entitled "Extra
+compile options"
+below.
+
+
+int pcre2_set_max_pattern_length(pcre2_compile_context *ccontext,
+ PCRE2_SIZE value);
+
+
+This sets a maximum length, in code units, for any pattern string that is
+compiled with this context. If the pattern is longer, an error is generated.
+This facility is provided so that applications that accept patterns from
+external sources can limit their size. The default is the largest number that a
+PCRE2_SIZE variable can hold, which is effectively unlimited.
+
+
+int pcre2_set_max_pattern_compiled_length(
+ pcre2_compile_context *ccontext, PCRE2_SIZE value);
+
+
+This sets a maximum size, in bytes, for the memory needed to hold the compiled
+version of a pattern that is compiled with this context. If the pattern needs
+more memory, an error is generated. This facility is provided so that
+applications that accept patterns from external sources can limit the amount of
+memory they use. The default is the largest number that a PCRE2_SIZE variable
+can hold, which is effectively unlimited.
+
+
+int pcre2_set_max_varlookbehind(pcre2_compile_context *ccontext,
+ uint32_t value);
+
+
+This sets a maximum length for the number of characters matched by a
+variable-length lookbehind assertion. The default is set when PCRE2 is built,
+with the ultimate default being 255, the same as Perl. Lookbehind assertions
+without a bounding length are not supported.
+
+
+int pcre2_set_newline(pcre2_compile_context *ccontext,
+ uint32_t value);
+
+
+This specifies which characters or character sequences are to be recognized as
+newlines. The value must be one of PCRE2_NEWLINE_CR (carriage return only),
+PCRE2_NEWLINE_LF (linefeed only), PCRE2_NEWLINE_CRLF (the two-character
+sequence CR followed by LF), PCRE2_NEWLINE_ANYCRLF (any of the above),
+PCRE2_NEWLINE_ANY (any Unicode newline sequence), or PCRE2_NEWLINE_NUL (the
+NUL character, that is a binary zero).
+
+A pattern can override the value set in the compile context by starting with a +sequence such as (*CRLF). See the +pcre2pattern +page for details. +
+
+When a pattern is compiled with the PCRE2_EXTENDED or PCRE2_EXTENDED_MORE
+option, the newline convention affects the recognition of the end of internal
+comments starting with #. The value is saved with the compiled pattern for
+subsequent use by the JIT compiler and by the two interpreted matching
+functions, pcre2_match() and pcre2_dfa_match().
+
+
+int pcre2_set_parens_nest_limit(pcre2_compile_context *ccontext,
+ uint32_t value);
+
+
+This parameter adjusts the limit, set when PCRE2 is built (default 250), on the
+depth of parenthesis nesting in a pattern. This limit stops rogue patterns
+using up too much system stack when being compiled. The limit applies to
+parentheses of all kinds, not just capturing parentheses.
+
+
+int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext,
+ int (*guard_function)(uint32_t, void *), void *user_data);
+
+
+There is at least one application that runs PCRE2 in threads with very limited
+system stack, where running out of stack is to be avoided at all costs. The
+parenthesis limit above cannot take account of how much stack is actually
+available during compilation. For a finer control, you can supply a function
+that is called whenever pcre2_compile() starts to compile a parenthesized
+part of a pattern. This function can check the actual stack size (or anything
+else that it wants to, of course).
+
+The first argument to the callout function gives the current depth of +nesting, and the second is user data that is set up by the last argument of +pcre2_set_compile_recursion_guard(). The callout function should return +zero if all is well, or non-zero to force an error. +
++A match context is required if you want to: +
+ Set up a callout function + Set an offset limit for matching an unanchored pattern + Change the limit on the amount of heap used when matching + Change the backtracking match limit + Change the backtracking depth limit + Set custom memory management specifically for the match ++If none of these apply, just pass NULL as the context argument of +pcre2_match(), pcre2_dfa_match(), or pcre2_jit_match(). + +
+A match context is created, copied, and freed by the following functions:
+
+
+pcre2_match_context *pcre2_match_context_create(
+ pcre2_general_context *gcontext);
+
+
+pcre2_match_context *pcre2_match_context_copy(
+ pcre2_match_context *mcontext);
+
+
+void pcre2_match_context_free(pcre2_match_context *mcontext);
+
+
+A match context is created with default values for its parameters. These can
+be changed by calling the following functions, which return 0 on success, or
+PCRE2_ERROR_BADDATA if invalid data is detected.
+
+
+int pcre2_set_callout(pcre2_match_context *mcontext,
+ int (*callout_function)(pcre2_callout_block *, void *),
+ void *callout_data);
+
+
+This sets up a callout function for PCRE2 to call at specified points
+during a matching operation. Details are given in the
+pcre2callout
+documentation.
+
+
+int pcre2_set_substitute_callout(pcre2_match_context *mcontext,
+ int (*callout_function)(pcre2_substitute_callout_block *, void *),
+ void *callout_data);
+
+
+This sets up a callout function for PCRE2 to call after each substitution
+made by pcre2_substitute(). Details are given in the section entitled
+"Creating a new string with substitutions"
+below.
+
+
+int pcre2_set_offset_limit(pcre2_match_context *mcontext,
+ PCRE2_SIZE value);
+
+
+The offset_limit parameter limits how far an unanchored search can
+advance in the subject string. The default value is PCRE2_UNSET. The
+pcre2_match() and pcre2_dfa_match() functions return
+PCRE2_ERROR_NOMATCH if a match with a starting point before or at the given
+offset is not found. The pcre2_substitute() function makes no more
+substitutions.
+
+For example, if the pattern /abc/ is matched against "123abc" with an offset +limit less than 3, the result is PCRE2_ERROR_NOMATCH. A match can never be +found if the startoffset argument of pcre2_match(), +pcre2_dfa_match(), or pcre2_substitute() is greater than the offset +limit set in the match context. +
++When using this facility, you must set the PCRE2_USE_OFFSET_LIMIT option when +calling pcre2_compile() so that when JIT is in use, different code can be +compiled. If a match is started with a non-default offset limit when +PCRE2_USE_OFFSET_LIMIT is not set, an error is generated. +
+
+The offset limit facility can be used to track progress when searching large
+subject strings or to limit the extent of global substitutions. See also the
+PCRE2_FIRSTLINE option, which requires a match to start before or at the first
+newline that follows the start of matching in the subject. If this is set with
+an offset limit, a match must occur in the first line and also within the
+offset limit. In other words, whichever limit comes first is used.
+
+
+int pcre2_set_heap_limit(pcre2_match_context *mcontext,
+ uint32_t value);
+
+
+The heap_limit parameter specifies, in units of kibibytes (1024 bytes),
+the maximum amount of heap memory that pcre2_match() may use to hold
+backtracking information when running an interpretive match. This limit also
+applies to pcre2_dfa_match(), which may use the heap when processing
+patterns with a lot of nested pattern recursion or lookarounds or atomic
+groups. This limit does not apply to matching with the JIT optimization, which
+has its own memory control arrangements (see the
+pcre2jit
+documentation for more details). If the limit is reached, the negative error
+code PCRE2_ERROR_HEAPLIMIT is returned. The default limit can be set when PCRE2
+is built; if it is not, the default is set very large and is essentially
+unlimited.
+
+A value for the heap limit may also be supplied by an item at the start of a +pattern of the form +
+ (*LIMIT_HEAP=ddd) ++where ddd is a decimal number. However, such a setting is ignored unless ddd is +less than the limit set by the caller of pcre2_match() or, if no such +limit is set, less than the default. + +
+The pcre2_match() function always needs some heap memory, so setting a +value of zero guarantees a "heap limit exceeded" error. Details of how +pcre2_match() uses the heap are given in the +pcre2perform +documentation. +
+
+For pcre2_dfa_match(), a vector on the system stack is used when
+processing pattern recursions, lookarounds, or atomic groups, and only if this
+is not big enough is heap memory used. In this case, setting a value of zero
+disables the use of the heap.
+
+
+int pcre2_set_match_limit(pcre2_match_context *mcontext,
+ uint32_t value);
+
+
+The match_limit parameter provides a means of preventing PCRE2 from using
+up too many computing resources when processing patterns that are not going to
+match, but which have a very large number of possibilities in their search
+trees. The classic example is a pattern that uses nested unlimited repeats.
+
+There is an internal counter in pcre2_match() that is incremented each +time round its main matching loop. If this value reaches the match limit, +pcre2_match() returns the negative value PCRE2_ERROR_MATCHLIMIT. This has +the effect of limiting the amount of backtracking that can take place. For +patterns that are not anchored, the count restarts from zero for each position +in the subject string. This limit also applies to pcre2_dfa_match(), +though the counting is done in a different way. +
++When pcre2_match() is called with a pattern that was successfully +processed by pcre2_jit_compile(), the way in which matching is executed +is entirely different. However, there is still the possibility of runaway +matching that goes on for a very long time, and so the match_limit value +is also used in this case (but in a different way) to limit how long the +matching can continue. +
++The default value for the limit can be set when PCRE2 is built; the default is +10 million, which handles all but the most extreme cases. A value for the match +limit may also be supplied by an item at the start of a pattern of the form +
+ (*LIMIT_MATCH=ddd) ++where ddd is a decimal number. However, such a setting is ignored unless ddd is +less than the limit set by the caller of pcre2_match() or +pcre2_dfa_match() or, if no such limit is set, less than the default. +
+The depth limit is not relevant, and is ignored, when matching is done using +JIT compiled code. However, it is supported by pcre2_dfa_match(), which +uses it to limit the depth of nested internal recursive function calls that +implement atomic groups, lookaround assertions, and pattern recursions. This +limits, indirectly, the amount of system stack that is used. It was more useful +in versions before 10.32, when stack memory was used for local workspace +vectors for recursive function calls. From version 10.32, only local variables +are allocated on the stack and as each call uses only a few hundred bytes, even +a small stack can support quite a lot of recursion. +
++If the depth of internal recursive function calls is great enough, local +workspace vectors are allocated on the heap from version 10.32 onwards, so the +depth limit also indirectly limits the amount of heap memory that is used. A +recursive pattern such as /(.(?2))((?1)|)/, when matched to a very long string +using pcre2_dfa_match(), can use a great deal of memory. However, it is +probably better to limit heap usage directly by calling +pcre2_set_heap_limit(). +
++The default value for the depth limit can be set when PCRE2 is built; if it is +not, the default is set to the same value as the default for the match limit. +If the limit is exceeded, pcre2_match() or pcre2_dfa_match() +returns PCRE2_ERROR_DEPTHLIMIT. A value for the depth limit may also be +supplied by an item at the start of a pattern of the form +
+ (*LIMIT_DEPTH=ddd) ++where ddd is a decimal number. However, such a setting is ignored unless ddd is +less than the limit set by the caller of pcre2_match() or +pcre2_dfa_match() or, if no such limit is set, less than the default. + +
+int pcre2_config(uint32_t what, void *where); +
++The function pcre2_config() makes it possible for a PCRE2 client to find +the value of certain configuration parameters and to discover which optional +features have been compiled into the PCRE2 library. The +pcre2build +documentation has more details about these features. +
++The first argument for pcre2_config() specifies which information is +required. The second argument is a pointer to memory into which the information +is placed. If NULL is passed, the function returns the amount of memory that is +needed for the requested information. For calls that return numerical values, +the value is in bytes; when requesting these values, where should point +to appropriately aligned memory. For calls that return strings, the required +length is given in code units, not counting the terminating zero. +
++When requesting information, the returned value from pcre2_config() is +non-negative on success, or the negative error code PCRE2_ERROR_BADOPTION if +the value in the first argument is not recognized. The following information is +available: +
+ PCRE2_CONFIG_BSR ++The output is a uint32_t integer whose value indicates what character +sequences the \R escape sequence matches by default. A value of +PCRE2_BSR_UNICODE means that \R matches any Unicode line ending sequence; a +value of PCRE2_BSR_ANYCRLF means that \R matches only CR, LF, or CRLF. The +default can be overridden when a pattern is compiled. +
+ PCRE2_CONFIG_COMPILED_WIDTHS ++The output is a uint32_t integer whose lower bits indicate which code unit +widths were selected when PCRE2 was built. The 1-bit indicates 8-bit support, +and the 2-bit and 4-bit indicate 16-bit and 32-bit support, respectively. +
+ PCRE2_CONFIG_DEPTHLIMIT ++The output is a uint32_t integer that gives the default limit for the depth of +nested backtracking in pcre2_match() or the depth of nested recursions, +lookarounds, and atomic groups in pcre2_dfa_match(). Further details are +given with pcre2_set_depth_limit() above. +
+ PCRE2_CONFIG_HEAPLIMIT ++The output is a uint32_t integer that gives, in kibibytes, the default limit +for the amount of heap memory used by pcre2_match() or +pcre2_dfa_match(). Further details are given with +pcre2_set_heap_limit() above. +
+ PCRE2_CONFIG_JIT ++The output is a uint32_t integer that is set to one if support for just-in-time +compiling is included in the library; otherwise it is set to zero. Note that +having the support in the library does not guarantee that JIT will be used for +any given match. See the +pcre2jit +documentation for more details. +
+ PCRE2_CONFIG_JITTARGET ++The where argument should point to a buffer that is at least 48 code +units long. (The exact length required can be found by calling +pcre2_config() with where set to NULL.) The buffer is filled with a +string that contains the name of the architecture for which the JIT compiler is +configured, for example "x86 32bit (little endian + unaligned)". If JIT support +is not available, PCRE2_ERROR_BADOPTION is returned, otherwise the number of +code units used is returned. This is the length of the string, plus one unit +for the terminating zero. +
+ PCRE2_CONFIG_LINKSIZE ++The output is a uint32_t integer that contains the number of bytes used for +internal linkage in compiled regular expressions. When PCRE2 is configured, the +value can be set to 2, 3, or 4, with the default being 2. This is the value +that is returned by pcre2_config(). However, when the 16-bit library is +compiled, a value of 3 is rounded up to 4, and when the 32-bit library is +compiled, internal linkages always use 4 bytes, so the configured value is not +relevant. + +
+The default value of 2 for the 8-bit and 16-bit libraries is sufficient for all +but the most massive patterns, since it allows the size of the compiled pattern +to be up to 65535 code units. Larger values allow larger regular expressions to +be compiled by those two libraries, but at the expense of slower matching. +
+ PCRE2_CONFIG_MATCHLIMIT ++The output is a uint32_t integer that gives the default match limit for +pcre2_match(). Further details are given with +pcre2_set_match_limit() above. +
+ PCRE2_CONFIG_NEWLINE ++The output is a uint32_t integer whose value specifies the default character +sequence that is recognized as meaning "newline". The values are: +
+ PCRE2_NEWLINE_CR Carriage return (CR) + PCRE2_NEWLINE_LF Linefeed (LF) + PCRE2_NEWLINE_CRLF Carriage return, linefeed (CRLF) + PCRE2_NEWLINE_ANY Any Unicode line ending + PCRE2_NEWLINE_ANYCRLF Any of CR, LF, or CRLF + PCRE2_NEWLINE_NUL The NUL character (binary zero) ++The default should normally correspond to the standard sequence for your +operating system. +
+ PCRE2_CONFIG_NEVER_BACKSLASH_C ++The output is a uint32_t integer that is set to one if the use of \C was +permanently disabled when PCRE2 was built; otherwise it is set to zero. +
+ PCRE2_CONFIG_PARENSLIMIT ++The output is a uint32_t integer that gives the maximum depth of nesting +of parentheses (of any kind) in a pattern. This limit is imposed to cap the +amount of system stack used when a pattern is compiled. It is specified when +PCRE2 is built; the default is 250. This limit does not take into account the +stack that may already be used by the calling application. For finer control +over compilation stack usage, see pcre2_set_compile_recursion_guard(). +
+ PCRE2_CONFIG_STACKRECURSE ++This parameter is obsolete and should not be used in new code. The output is a +uint32_t integer that is always set to zero. +
+ PCRE2_CONFIG_TABLES_LENGTH ++The output is a uint32_t integer that gives the length of PCRE2's character +processing tables in bytes. For details of these tables see the +section on locale support +below. +
+ PCRE2_CONFIG_UNICODE_VERSION ++The where argument should point to a buffer that is at least 24 code +units long. (The exact length required can be found by calling +pcre2_config() with where set to NULL.) If PCRE2 has been compiled +without Unicode support, the buffer is filled with the text "Unicode not +supported". Otherwise, the Unicode version string (for example, "8.0.0") is +inserted. The number of code units used is returned. This is the length of the +string plus one unit for the terminating zero. +
+ PCRE2_CONFIG_UNICODE ++The output is a uint32_t integer that is set to one if Unicode support is +available; otherwise it is set to zero. Unicode support implies UTF support. +
+ PCRE2_CONFIG_VERSION ++The where argument should point to a buffer that is at least 24 code +units long. (The exact length required can be found by calling +pcre2_config() with where set to NULL.) The buffer is filled with +the PCRE2 version string, zero-terminated. The number of code units used is +returned. This is the length of the string plus one unit for the terminating +zero. + +
+pcre2_code *pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE length,
+ uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset,
+ pcre2_compile_context *ccontext);
+
+
+void pcre2_code_free(pcre2_code *code);
+
+
+pcre2_code *pcre2_code_copy(const pcre2_code *code);
+
+
+pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *code);
+
+The pcre2_compile() function compiles a pattern into an internal form. +The pattern is defined by a pointer to a string of code units and a length in +code units. If the pattern is zero-terminated, the length can be specified as +PCRE2_ZERO_TERMINATED. A NULL pattern pointer with a length of zero is treated +as an empty string (NULL with a non-zero length causes an error return). The +function returns a pointer to a block of memory that contains the compiled +pattern and related data, or NULL if an error occurred. +
++If the compile context argument ccontext is NULL, memory for the compiled +pattern is obtained by calling malloc(). Otherwise, it is obtained from +the same memory function that was used for the compile context. The caller must +free the memory by calling pcre2_code_free() when it is no longer needed. +If pcre2_code_free() is called with a NULL argument, it returns +immediately, without doing anything. +
++The function pcre2_code_copy() makes a copy of the compiled code in new +memory, using the same memory allocator as was used for the original. However, +if the code has been processed by the JIT compiler (see +below), +the JIT information cannot be copied (because it is position-dependent). +The new copy can initially be used only for non-JIT matching, though it can be +passed to pcre2_jit_compile() if required. If pcre2_code_copy() is +called with a NULL argument, it returns NULL. +
++The pcre2_code_copy() function provides a way for individual threads in a +multithreaded application to acquire a private copy of shared compiled code. +However, it does not make a copy of the character tables used by the compiled +pattern; the new pattern code points to the same tables as the original code. +(See +"Locale Support" +below for details of these character tables.) In many applications the same +tables are used throughout, so this behaviour is appropriate. Nevertheless, +there are occasions when a copy of a compiled pattern and the relevant tables +are needed. The pcre2_code_copy_with_tables() provides this facility. +Copies of both the code and the tables are made, with the new code pointing to +the new tables. The memory for the new tables is automatically freed when +pcre2_code_free() is called for the new copy of the compiled code. If +pcre2_code_copy_with_tables() is called with a NULL argument, it returns +NULL. +
++NOTE: When one of the matching functions is called, pointers to the compiled +pattern and the subject string are set in the match data block so that they can +be referenced by the substring extraction functions after a successful match. +After running a match, you must not free a compiled pattern or a subject string +until after all operations on the +match data block +have taken place, unless, in the case of the subject string, you have used the +PCRE2_COPY_MATCHED_SUBJECT option, which is described in the section entitled +"Option bits for pcre2_match()" +below. +
++The options argument for pcre2_compile() contains various bit +settings that affect the compilation. It should be zero if none of them are +required. The available options are described below. Some of them (in +particular, those that are compatible with Perl, but some others as well) can +also be set and unset from within the pattern (see the detailed description in +the +pcre2pattern +documentation). +
++For those options that can be different in different parts of the pattern, the +contents of the options argument specifies their settings at the start of +compilation. The PCRE2_ANCHORED, PCRE2_ENDANCHORED, and PCRE2_NO_UTF_CHECK +options can be set at the time of matching as well as at compile time. +
++Some additional options and less frequently required compile-time parameters +(for example, the newline setting) can be provided in a compile context (as +described +above). +
++If errorcode or erroroffset is NULL, pcre2_compile() returns +NULL immediately. Otherwise, the variables to which these point are set to an +error code and an offset (number of code units) within the pattern, +respectively, when pcre2_compile() returns NULL because a compilation +error has occurred. +
++There are nearly 100 positive error codes that pcre2_compile() may return +if it finds an error in the pattern. There are also some negative error codes +that are used for invalid UTF strings when validity checking is in force. These +are the same as given by pcre2_match() and pcre2_dfa_match(), and +are described in the +pcre2unicode +documentation. There is no separate documentation for the positive error codes, +because the textual error messages that are obtained by calling the +pcre2_get_error_message() function (see "Obtaining a textual error +message" +below) +should be self-explanatory. Macro names starting with PCRE2_ERROR_ are defined +for both positive and negative error codes in pcre2.h. When compilation +is successful errorcode is set to a value that returns the message "no +error" if passed to pcre2_get_error_message(). +
++The value returned in erroroffset is an indication of where in the +pattern an error occurred. When there is no error, zero is returned. A non-zero +value is not necessarily the furthest point in the pattern that was read. For +example, after the error "lookbehind assertion is not fixed length", the error +offset points to the start of the failing assertion. For an invalid UTF-8 or +UTF-16 string, the offset is that of the first code unit of the failing +character. +
++Some errors are not detected until the whole pattern has been scanned; in these +cases, the offset passed back is the length of the pattern. Note that the +offset is in code units, not characters, even in a UTF mode. It may sometimes +point into the middle of a UTF-8 or UTF-16 character. +
++This code fragment shows a typical straightforward call to +pcre2_compile(): +
+ pcre2_code *re; + PCRE2_SIZE erroffset; + int errorcode; + re = pcre2_compile( + "^A.*Z", /* the pattern */ + PCRE2_ZERO_TERMINATED, /* the pattern is zero-terminated */ + 0, /* default options */ + &errorcode, /* for error code */ + &erroffset, /* for error offset */ + NULL); /* no compile context */ + ++ +
+The following names for option bits are defined in the pcre2.h header +file: +
+ PCRE2_ANCHORED ++If this bit is set, the pattern is forced to be "anchored", that is, it is +constrained to match only at the first matching point in the string that is +being searched (the "subject string"). This effect can also be achieved by +appropriate constructs in the pattern itself, which is the only way to do it in +Perl. +
+ PCRE2_ALLOW_EMPTY_CLASS ++By default, for compatibility with Perl, a closing square bracket that +immediately follows an opening one is treated as a data character for the +class. When PCRE2_ALLOW_EMPTY_CLASS is set, it terminates the class, which +therefore contains no characters and so can never match. +
+ PCRE2_ALT_BSUX ++This option requests alternative handling of three escape sequences, which +makes PCRE2's behaviour more like ECMAscript (aka JavaScript). When it is set: + +
+(1) \U matches an upper case "U" character; by default \U causes a compile +time error (Perl uses \U to upper case subsequent characters). +
++(2) \u matches a lower case "u" character unless it is followed by four +hexadecimal digits, in which case the hexadecimal number defines the code point +to match. By default, \u causes a compile time error (Perl uses it to upper +case the following character). +
++(3) \x matches a lower case "x" character unless it is followed by two +hexadecimal digits, in which case the hexadecimal number defines the code point +to match. By default, as in Perl, a hexadecimal number is always expected after +\x, but it may have zero, one, or two digits (so, for example, \xz matches a +binary zero character followed by z). +
++ECMAscript 6 added additional functionality to \u. This can be accessed using +the PCRE2_EXTRA_ALT_BSUX extra option (see "Extra compile options" +below). +Note that this alternative escape handling applies only to patterns. Neither of +these options affects the processing of replacement strings passed to +pcre2_substitute(). +
+ PCRE2_ALT_CIRCUMFLEX ++In multiline mode (when PCRE2_MULTILINE is set), the circumflex metacharacter +matches at the start of the subject (unless PCRE2_NOTBOL is set), and also +after any internal newline. However, it does not match after a newline at the +end of the subject, for compatibility with Perl. If you want a multiline +circumflex also to match after a terminating newline, you must set +PCRE2_ALT_CIRCUMFLEX. +
+ PCRE2_ALT_VERBNAMES ++By default, for compatibility with Perl, the name in any verb sequence such as +(*MARK:NAME) is any sequence of characters that does not include a closing +parenthesis. The name is not processed in any way, and it is not possible to +include a closing parenthesis in the name. However, if the PCRE2_ALT_VERBNAMES +option is set, normal backslash processing is applied to verb names and only an +unescaped closing parenthesis terminates the name. A closing parenthesis can be +included in a name either as \) or between \Q and \E. If the PCRE2_EXTENDED +or PCRE2_EXTENDED_MORE option is set with PCRE2_ALT_VERBNAMES, unescaped +whitespace in verb names is skipped and #-comments are recognized, exactly as +in the rest of the pattern. +
+ PCRE2_AUTO_CALLOUT ++If this bit is set, pcre2_compile() automatically inserts callout items, +all with number 255, before each pattern item, except immediately before or +after an explicit callout in the pattern. For discussion of the callout +facility, see the +pcre2callout +documentation. +
+ PCRE2_CASELESS ++If this bit is set, letters in the pattern match both upper and lower case +letters in the subject. It is equivalent to Perl's /i option, and it can be +changed within a pattern by a (?i) option setting. If either PCRE2_UTF or +PCRE2_UCP is set, Unicode properties are used for all characters with more than +one other case, and for all characters whose code points are greater than +U+007F. Note that there are two ASCII characters, K and S, that, in addition to +their lower case ASCII equivalents, are case-equivalent with U+212A (Kelvin +sign) and U+017F (long S) respectively. If you do not want this case +equivalence, you can suppress it by setting PCRE2_EXTRA_CASELESS_RESTRICT. + +
+For lower valued characters with only one other case, a lookup table is used +for speed. When neither PCRE2_UTF nor PCRE2_UCP is set, a lookup table is used +for all code points less than 256, and higher code points (available only in +16-bit or 32-bit mode) are treated as not having another case. +
+ PCRE2_DOLLAR_ENDONLY ++If this bit is set, a dollar metacharacter in the pattern matches only at the +end of the subject string. Without this option, a dollar also matches +immediately before a newline at the end of the string (but not before any other +newlines). The PCRE2_DOLLAR_ENDONLY option is ignored if PCRE2_MULTILINE is +set. There is no equivalent to this option in Perl, and no way to set it within +a pattern. +
+ PCRE2_DOTALL ++If this bit is set, a dot metacharacter in the pattern matches any character, +including one that indicates a newline. However, it only ever matches one +character, even if newlines are coded as CRLF. Without this option, a dot does +not match when the current position in the subject is at a newline. This option +is equivalent to Perl's /s option, and it can be changed within a pattern by a +(?s) option setting. A negative class such as [^a] always matches newline +characters, and the \N escape sequence always matches a non-newline character, +independent of the setting of PCRE2_DOTALL. +
+ PCRE2_DUPNAMES ++If this bit is set, names used to identify capture groups need not be unique. +This can be helpful for certain types of pattern when it is known that only one +instance of the named group can ever be matched. There are more details of +named capture groups below; see also the +pcre2pattern +documentation. +
+ PCRE2_ENDANCHORED ++If this bit is set, the end of any pattern match must be right at the end of +the string being searched (the "subject string"). If the pattern match +succeeds by reaching (*ACCEPT), but does not reach the end of the subject, the +match fails at the current starting point. For unanchored patterns, a new match +is then tried at the next starting point. However, if the match succeeds by +reaching the end of the pattern, but not the end of the subject, backtracking +occurs and an alternative match may be found. Consider these two patterns: +
+ .(*ACCEPT)|.. + .|.. ++If matched against "abc" with PCRE2_ENDANCHORED set, the first matches "c" +whereas the second matches "bc". The effect of PCRE2_ENDANCHORED can also be +achieved by appropriate constructs in the pattern itself, which is the only way +to do it in Perl. + +
+For DFA matching with pcre2_dfa_match(), PCRE2_ENDANCHORED applies only +to the first (that is, the longest) matched string. Other parallel matches, +which are necessarily substrings of the first one, must obviously end before +the end of the subject. +
+ PCRE2_EXTENDED ++If this bit is set, most white space characters in the pattern are totally +ignored except when escaped, inside a character class, or inside a \Q...\E +sequence. However, white space is not allowed within sequences such as (?> that +introduce various parenthesized groups, nor within numerical quantifiers such +as {1,3}. Ignorable white space is permitted between an item and a following +quantifier and between a quantifier and a following + that indicates +possessiveness. PCRE2_EXTENDED is equivalent to Perl's /x option, and it can be +changed within a pattern by a (?x) option setting. + +
+When PCRE2 is compiled without Unicode support, PCRE2_EXTENDED recognizes as +white space only those characters with code points less than 256 that are +flagged as white space in its low-character table. The table is normally +created by +pcre2_maketables(), +which uses the isspace() function to identify space characters. In most +ASCII environments, the relevant characters are those with code points 0x0009 +(tab), 0x000A (linefeed), 0x000B (vertical tab), 0x000C (formfeed), 0x000D +(carriage return), and 0x0020 (space). +
++When PCRE2 is compiled with Unicode support, in addition to these characters, +five more Unicode "Pattern White Space" characters are recognized by +PCRE2_EXTENDED. These are U+0085 (next line), U+200E (left-to-right mark), +U+200F (right-to-left mark), U+2028 (line separator), and U+2029 (paragraph +separator). This set of characters is the same as recognized by Perl's /x +option. Note that the horizontal and vertical space characters that are matched +by the \h and \v escapes in patterns are a much bigger set. +
++As well as ignoring most white space, PCRE2_EXTENDED also causes characters +between an unescaped # outside a character class and the next newline, +inclusive, to be ignored, which makes it possible to include comments inside +complicated patterns. Note that the end of this type of comment is a literal +newline sequence in the pattern; escape sequences that happen to represent a +newline do not count. +
++Which characters are interpreted as newlines can be specified by a setting in +the compile context that is passed to pcre2_compile() or by a special +sequence at the start of the pattern, as described in the section entitled +"Newline conventions" +in the pcre2pattern documentation. A default is defined when PCRE2 is +built. +
+ PCRE2_EXTENDED_MORE ++This option has the effect of PCRE2_EXTENDED, but, in addition, unescaped space +and horizontal tab characters are ignored inside a character class. Note: only +these two characters are ignored, not the full set of pattern white space +characters that are ignored outside a character class. PCRE2_EXTENDED_MORE is +equivalent to Perl's /xx option, and it can be changed within a pattern by a +(?xx) option setting. +
+ PCRE2_FIRSTLINE ++If this option is set, the start of an unanchored pattern match must be before +or at the first newline in the subject string following the start of matching, +though the matched text may continue over the newline. If startoffset is +non-zero, the limiting newline is not necessarily the first newline in the +subject. For example, if the subject string is "abc\nxyz" (where \n +represents a single-character newline) a pattern match for "yz" succeeds with +PCRE2_FIRSTLINE if startoffset is greater than 3. See also +PCRE2_USE_OFFSET_LIMIT, which provides a more general limiting facility. If +PCRE2_FIRSTLINE is set with an offset limit, a match must occur in the first +line and also within the offset limit. In other words, whichever limit comes +first is used. This option has no effect for anchored patterns. +
+ PCRE2_LITERAL ++If this option is set, all meta-characters in the pattern are disabled, and it +is treated as a literal string. Matching literal strings with a regular +expression engine is not the most efficient way of doing it. If you are doing a +lot of literal matching and are worried about efficiency, you should consider +using other approaches. The only other main options that are allowed with +PCRE2_LITERAL are: PCRE2_ANCHORED, PCRE2_ENDANCHORED, PCRE2_AUTO_CALLOUT, +PCRE2_CASELESS, PCRE2_FIRSTLINE, PCRE2_MATCH_INVALID_UTF, +PCRE2_NO_START_OPTIMIZE, PCRE2_NO_UTF_CHECK, PCRE2_UTF, and +PCRE2_USE_OFFSET_LIMIT. The extra options PCRE2_EXTRA_MATCH_LINE and +PCRE2_EXTRA_MATCH_WORD are also supported. Any other options cause an error. +
+ PCRE2_MATCH_INVALID_UTF ++This option forces PCRE2_UTF (see below) and also enables support for matching +by pcre2_match() in subject strings that contain invalid UTF sequences. +Note, however, that the 16-bit and 32-bit PCRE2 libraries process strings as +sequences of uint16_t or uint32_t code points. They cannot find valid UTF +sequences within an arbitrary string of bytes unless such sequences are +suitably aligned. This facility is not supported for DFA matching. For details, +see the +pcre2unicode +documentation. +
+ PCRE2_MATCH_UNSET_BACKREF ++If this option is set, a backreference to an unset capture group matches an +empty string (by default this causes the current matching alternative to fail). +A pattern such as (\1)(a) succeeds when this option is set (assuming it can +find an "a" in the subject), whereas it fails by default, for Perl +compatibility. Setting this option makes PCRE2 behave more like ECMAscript (aka +JavaScript). +
+ PCRE2_MULTILINE ++By default, for the purposes of matching "start of line" and "end of line", +PCRE2 treats the subject string as consisting of a single line of characters, +even if it actually contains newlines. The "start of line" metacharacter (^) +matches only at the start of the string, and the "end of line" metacharacter +($) matches only at the end of the string, or before a terminating newline +(except when PCRE2_DOLLAR_ENDONLY is set). Note, however, that unless +PCRE2_DOTALL is set, the "any character" metacharacter (.) does not match at a +newline. This behaviour (for ^, $, and dot) is the same as Perl. + +
+When PCRE2_MULTILINE is set, the "start of line" and "end of line" +constructs match immediately following or immediately before internal newlines +in the subject string, respectively, as well as at the very start and end. This +is equivalent to Perl's /m option, and it can be changed within a pattern by a +(?m) option setting. Note that the "start of line" metacharacter does not match +after a newline at the end of the subject, for compatibility with Perl. +However, you can change this by setting the PCRE2_ALT_CIRCUMFLEX option. If +there are no newlines in a subject string, or no occurrences of ^ or $ in a +pattern, setting PCRE2_MULTILINE has no effect. +
+ PCRE2_NEVER_BACKSLASH_C ++This option locks out the use of \C in the pattern that is being compiled. +This escape can cause unpredictable behaviour in UTF-8 or UTF-16 modes, because +it may leave the current matching point in the middle of a multi-code-unit +character. This option may be useful in applications that process patterns from +external sources. Note that there is also a build-time option that permanently +locks out the use of \C. +
+ PCRE2_NEVER_UCP ++This option locks out the use of Unicode properties for handling \B, \b, \D, +\d, \S, \s, \W, \w, and some of the POSIX character classes, as described +for the PCRE2_UCP option below. In particular, it prevents the creator of the +pattern from enabling this facility by starting the pattern with (*UCP). This +option may be useful in applications that process patterns from external +sources. The option combination PCRE2_UCP and PCRE2_NEVER_UCP causes an error. +
+ PCRE2_NEVER_UTF ++This option locks out interpretation of the pattern as UTF-8, UTF-16, or +UTF-32, depending on which library is in use. In particular, it prevents the +creator of the pattern from switching to UTF interpretation by starting the +pattern with (*UTF). This option may be useful in applications that process +patterns from external sources. The combination of PCRE2_UTF and +PCRE2_NEVER_UTF causes an error. +
+ PCRE2_NO_AUTO_CAPTURE ++If this option is set, it disables the use of numbered capturing parentheses in +the pattern. Any opening parenthesis that is not followed by ? behaves as if it +were followed by ?: but named parentheses can still be used for capturing (and +they acquire numbers in the usual way). This is the same as Perl's /n option. +Note that, when this option is set, references to capture groups +(backreferences or recursion/subroutine calls) may only refer to named groups, +though the reference can be by name or by number. +
+ PCRE2_NO_AUTO_POSSESS ++If this option is set, it disables "auto-possessification", which is an +optimization that, for example, turns a+b into a++b in order to avoid +backtracks into a+ that can never be successful. However, if callouts are in +use, auto-possessification means that some callouts are never taken. You can +set this option if you want the matching functions to do a full unoptimized +search and run all the callouts, but it is mainly provided for testing +purposes. +
+ PCRE2_NO_DOTSTAR_ANCHOR ++If this option is set, it disables an optimization that is applied when .* is +the first significant item in a top-level branch of a pattern, and all the +other branches also start with .* or with \A or \G or ^. The optimization is +automatically disabled for .* if it is inside an atomic group or a capture +group that is the subject of a backreference, or if the pattern contains +(*PRUNE) or (*SKIP). When the optimization is not disabled, such a pattern is +automatically anchored if PCRE2_DOTALL is set for all the .* items and +PCRE2_MULTILINE is not set for any ^ items. Otherwise, the fact that any match +must start either at the start of the subject or following a newline is +remembered. Like other optimizations, this can cause callouts to be skipped. +
+ PCRE2_NO_START_OPTIMIZE ++This is an option whose main effect is at matching time. It does not change +what pcre2_compile() generates, but it does affect the output of the JIT +compiler. + +
+There are a number of optimizations that may occur at the start of a match, in +order to speed up the process. For example, if it is known that an unanchored +match must start with a specific code unit value, the matching code searches +the subject for that value, and fails immediately if it cannot find it, without +actually running the main matching function. This means that a special item +such as (*COMMIT) at the start of a pattern is not considered until after a +suitable starting point for the match has been found. Also, when callouts or +(*MARK) items are in use, these "start-up" optimizations can cause them to be +skipped if the pattern is never actually used. The start-up optimizations are +in effect a pre-scan of the subject that takes place before the pattern is run. +
++The PCRE2_NO_START_OPTIMIZE option disables the start-up optimizations, +possibly causing performance to suffer, but ensuring that in cases where the +result is "no match", the callouts do occur, and that items such as (*COMMIT) +and (*MARK) are considered at every possible starting position in the subject +string. +
++Setting PCRE2_NO_START_OPTIMIZE may change the outcome of a matching operation. +Consider the pattern +
+ (*COMMIT)ABC ++When this is compiled, PCRE2 records the fact that a match must start with the +character "A". Suppose the subject string is "DEFABC". The start-up +optimization scans along the subject, finds "A" and runs the first match +attempt from there. The (*COMMIT) item means that the pattern must match the +current starting position, which in this case, it does. However, if the same +match is run with PCRE2_NO_START_OPTIMIZE set, the initial scan along the +subject string does not happen. The first match attempt is run starting from +"D" and when this fails, (*COMMIT) prevents any further matches being tried, so +the overall result is "no match". + +
+Another start-up optimization makes use of a minimum length for a matching +subject, which is recorded when possible. Consider the pattern +
+ (*MARK:1)B(*MARK:2)(X|Y) ++The minimum length for a match is two characters. If the subject is "XXBB", the +"starting character" optimization skips "XX", then tries to match "BB", which +is long enough. In the process, (*MARK:2) is encountered and remembered. When +the match attempt fails, the next "B" is found, but there is only one character +left, so there are no more attempts, and "no match" is returned with the "last +mark seen" set to "2". If PCRE2_NO_START_OPTIMIZE is set, however, matches are tried +at every possible starting position, including at the end of the subject, where +(*MARK:1) is encountered, but there is no "B", so the "last mark seen" that is +returned is "1". In this case, the optimizations do not affect the overall +match result, which is still "no match", but they do affect the auxiliary +information that is returned. +
+ PCRE2_NO_UTF_CHECK ++When PCRE2_UTF is set, the validity of the pattern as a UTF string is +automatically checked. There are discussions about the validity of +UTF-8 strings, +UTF-16 strings, +and +UTF-32 strings +in the +pcre2unicode +document. If an invalid UTF sequence is found, pcre2_compile() returns a +negative error code. + +
+If you know that your pattern is a valid UTF string, and you want to skip this +check for performance reasons, you can set the PCRE2_NO_UTF_CHECK option. When +it is set, the effect of passing an invalid UTF string as a pattern is +undefined. It may cause your program to crash or loop. +
++Note that this option can also be passed to pcre2_match() and +pcre2_dfa_match(), to suppress UTF validity checking of the subject +string. +
++Note also that setting PCRE2_NO_UTF_CHECK at compile time does not disable the +error that is given if an escape sequence for an invalid Unicode code point is +encountered in the pattern. In particular, the so-called "surrogate" code +points (0xd800 to 0xdfff) are invalid. If you want to allow escape sequences +such as \x{d800} you can set the PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES extra +option, as described in the section entitled "Extra compile options" +below. +However, this is possible only in UTF-8 and UTF-32 modes, because these values +are not representable in UTF-16. +
+ PCRE2_UCP ++This option has two effects. Firstly, it changes the way PCRE2 processes \B, +\b, \D, \d, \S, \s, \W, \w, and some of the POSIX character classes. By +default, only ASCII characters are recognized, but if PCRE2_UCP is set, Unicode +properties are used to classify characters. There are some PCRE2_EXTRA +options (see below) that add finer control to this behaviour. More details are +given in the section on +generic character types +in the +pcre2pattern +page. + +
+The second effect of PCRE2_UCP is to force the use of Unicode properties for +upper/lower casing operations, even when PCRE2_UTF is not set. This makes it +possible to process strings in the 16-bit UCS-2 code. This option is available +only if PCRE2 has been compiled with Unicode support (which is the default). +The PCRE2_EXTRA_CASELESS_RESTRICT option (see below) restricts caseless +matching such that ASCII characters match only ASCII characters and non-ASCII +characters match only non-ASCII characters. +
+ PCRE2_UNGREEDY ++This option inverts the "greediness" of the quantifiers so that they are not +greedy by default, but become greedy if followed by "?". It is not compatible +with Perl. It can also be set by a (?U) option setting within the pattern. +
+ PCRE2_USE_OFFSET_LIMIT ++This option must be set for pcre2_compile() if +pcre2_set_offset_limit() is going to be used to set a non-default offset +limit in a match context for matches that use this pattern. An error is +generated if an offset limit is set without this option. For more details, see +the description of pcre2_set_offset_limit() in the +section +that describes match contexts. See also the PCRE2_FIRSTLINE +option above. +
+ PCRE2_UTF ++This option causes PCRE2 to regard both the pattern and the subject strings +that are subsequently processed as strings of UTF characters instead of +single-code-unit strings. It is available when PCRE2 is built to include +Unicode support (which is the default). If Unicode support is not available, +the use of this option provokes an error. Details of how PCRE2_UTF changes the +behaviour of PCRE2 are given in the +pcre2unicode +page. In particular, note that it changes the way PCRE2_CASELESS works. + +
+The option bits that can be set in a compile context by calling the +pcre2_set_compile_extra_options() function are as follows: +
+ PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK ++Since release 10.38 PCRE2 has forbidden the use of \K within lookaround +assertions, following Perl's lead. This option is provided to re-enable the +previous behaviour (act in positive lookarounds, ignore in negative ones) in +case anybody is relying on it. +
+ PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES ++This option applies when compiling a pattern in UTF-8 or UTF-32 mode. It is +forbidden in UTF-16 mode, and ignored in non-UTF modes. Unicode "surrogate" +code points in the range 0xd800 to 0xdfff are used in pairs in UTF-16 to encode +code points with values in the range 0x10000 to 0x10ffff. The surrogates cannot +therefore be represented in UTF-16. They can be represented in UTF-8 and +UTF-32, but are defined as invalid code points, and cause errors if encountered +in a UTF-8 or UTF-32 string that is being checked for validity by PCRE2. + +
+These values also cause errors if encountered in escape sequences such as +\x{d912} within a pattern. However, it seems that some applications, when +using PCRE2 to check for unwanted characters in UTF-8 strings, explicitly test +for the surrogates using escape sequences. The PCRE2_NO_UTF_CHECK option does +not disable the error that occurs, because it applies only to the testing of +input strings for UTF validity. +
++If the extra option PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is set, surrogate code +point values in UTF-8 and UTF-32 patterns no longer provoke errors and are +incorporated in the compiled pattern. However, they can only match subject +characters if the matching function is called with PCRE2_NO_UTF_CHECK set. +
+ PCRE2_EXTRA_ALT_BSUX ++The original option PCRE2_ALT_BSUX causes PCRE2 to process \U, \u, and \x in +the way that ECMAscript (aka JavaScript) does. Additional functionality was +defined by ECMAscript 6; setting PCRE2_EXTRA_ALT_BSUX has the effect of +PCRE2_ALT_BSUX, but in addition it recognizes \u{hhh..} as a hexadecimal +character code, where hhh.. is any number of hexadecimal digits. +
+ PCRE2_EXTRA_ASCII_BSD ++This option forces \d to match only ASCII digits, even when PCRE2_UCP is set. +It can be changed within a pattern by means of the (?aD) option setting. +
+ PCRE2_EXTRA_ASCII_BSS ++This option forces \s to match only ASCII space characters, even when +PCRE2_UCP is set. It can be changed within a pattern by means of the (?aS) +option setting. +
+ PCRE2_EXTRA_ASCII_BSW ++This option forces \w to match only ASCII word characters, even when PCRE2_UCP +is set. It can be changed within a pattern by means of the (?aW) option +setting. +
+ PCRE2_EXTRA_ASCII_DIGIT ++This option forces the POSIX character classes [:digit:] and [:xdigit:] to +match only ASCII digits, even when PCRE2_UCP is set. It can be changed within +a pattern by means of the (?aT) option setting. +
+ PCRE2_EXTRA_ASCII_POSIX ++This option forces all the POSIX character classes, including [:digit:] and +[:xdigit:], to match only ASCII characters, even when PCRE2_UCP is set. It can +be changed within a pattern by means of the (?aP) option setting, but note that +this also sets PCRE2_EXTRA_ASCII_DIGIT in order to ensure that (?-aP) unsets +all ASCII restrictions for POSIX classes. +
+ PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL ++This is a dangerous option. Use with care. By default, an unrecognized escape +such as \j or a malformed one such as \x{2z} causes a compile-time error when +detected by pcre2_compile(). Perl is somewhat inconsistent in handling +such items: for example, \j is treated as a literal "j", and non-hexadecimal +digits in \x{} are just ignored, though warnings are given in both cases if +Perl's warning switch is enabled. However, a malformed octal number after \o{ +always causes an error in Perl. + +
+If the PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL extra option is passed to +pcre2_compile(), all unrecognized or malformed escape sequences are +treated as single-character escapes. For example, \j is a literal "j" and +\x{2z} is treated as the literal string "x{2z}". Setting this option means +that typos in patterns may go undetected and have unexpected results. Also note +that a sequence such as [\N{] is interpreted as a malformed attempt at +[\N{...}] and so is treated as [N{] whereas [\N] gives an error because an +unqualified \N is a valid escape sequence but is not supported in a character +class. To reiterate: this is a dangerous option. Use with great care. +
+ PCRE2_EXTRA_CASELESS_RESTRICT ++When either PCRE2_UCP or PCRE2_UTF is set, caseless matching follows Unicode +rules, which allow for more than two cases per character. There are two +case-equivalent character sets that contain both ASCII and non-ASCII +characters. The ASCII letter S is case-equivalent to U+017f (long S) and the +ASCII letter K is case-equivalent to U+212a (Kelvin sign). This option disables +recognition of case-equivalences that cross the ASCII/non-ASCII boundary. In a +caseless match, both characters must either be ASCII or non-ASCII. The option +can be changed within a pattern by the (?r) option setting. +
+ PCRE2_EXTRA_ESCAPED_CR_IS_LF ++There are some legacy applications where the escape sequence \r in a pattern +is expected to match a newline. If this option is set, \r in a pattern is +converted to \n so that it matches a LF (linefeed) instead of a CR (carriage +return) character. The option does not affect a literal CR in the pattern, nor +does it affect CR specified as an explicit code point such as \x{0D}. +
+ PCRE2_EXTRA_MATCH_LINE ++This option is provided for use by the -x option of pcre2grep. It +causes the pattern only to match complete lines. This is achieved by +automatically inserting the code for "^(?:" at the start of the compiled +pattern and ")$" at the end. Thus, when PCRE2_MULTILINE is set, the matched +line may be in the middle of the subject string. This option can be used with +PCRE2_LITERAL. +
+ PCRE2_EXTRA_MATCH_WORD ++This option is provided for use by the -w option of pcre2grep. It +causes the pattern only to match strings that have a word boundary at the start +and the end. This is achieved by automatically inserting the code for "\b(?:" +at the start of the compiled pattern and ")\b" at the end. The option may be +used with PCRE2_LITERAL. However, it is ignored if PCRE2_EXTRA_MATCH_LINE is +also set. + +
+int pcre2_jit_compile(pcre2_code *code, uint32_t options);
+
+
+int pcre2_jit_match(const pcre2_code *code, PCRE2_SPTR subject,
+ PCRE2_SIZE length, PCRE2_SIZE startoffset,
+ uint32_t options, pcre2_match_data *match_data,
+ pcre2_match_context *mcontext);
+
+
+void pcre2_jit_free_unused_memory(pcre2_general_context *gcontext);
+
+
+pcre2_jit_stack *pcre2_jit_stack_create(size_t startsize,
+ size_t maxsize, pcre2_general_context *gcontext);
+
+
+void pcre2_jit_stack_assign(pcre2_match_context *mcontext,
+ pcre2_jit_callback callback_function, void *callback_data);
+
+
+void pcre2_jit_stack_free(pcre2_jit_stack *jit_stack);
+
+These functions provide support for JIT compilation, which, if the just-in-time +compiler is available, further processes a compiled pattern into machine code +that executes much faster than the pcre2_match() interpretive matching +function. Full details are given in the +pcre2jit +documentation. +
++JIT compilation is a heavyweight optimization. It can take some time for +patterns to be analyzed, and for one-off matches and simple patterns the +benefit of faster execution might be offset by a much slower compilation time. +Most (but not all) patterns can be optimized by the JIT compiler. +
+
+const uint8_t *pcre2_maketables(pcre2_general_context *gcontext);
+
+
+void pcre2_maketables_free(pcre2_general_context *gcontext,
+ const uint8_t *tables);
+
+PCRE2 handles caseless matching, and determines whether characters are letters, +digits, or whatever, by reference to a set of tables, indexed by character code +point. However, this applies only to characters whose code points are less than +256. By default, higher-valued code points never match escapes such as \w or +\d. +
++When PCRE2 is built with Unicode support (the default), certain Unicode +character properties can be tested with \p and \P, or, alternatively, the +PCRE2_UCP option can be set when a pattern is compiled; this causes \w and +friends to use Unicode property support instead of the built-in tables. +PCRE2_UCP also causes upper/lower casing operations on characters with code +points greater than 127 to use Unicode properties. These effects apply even +when PCRE2_UTF is not set. There are, however, some PCRE2_EXTRA options (see +above) that can be used to modify or suppress them. +
++The use of locales with Unicode is discouraged. If you are handling characters +with code points greater than 127, you should either use Unicode support, or +use locales, but not try to mix the two. +
++PCRE2 contains a built-in set of character tables that are used by default. +These are sufficient for many applications. Normally, the internal tables +recognize only ASCII characters. However, when PCRE2 is built, it is possible +to cause the internal tables to be rebuilt in the default "C" locale of the +local system, which may cause them to be different. +
++The built-in tables can be overridden by tables supplied by the application +that calls PCRE2. These may be created in a different locale from the default. +As more and more applications change to using Unicode, the need for this locale +support is expected to die away. +
++External tables are built by calling the pcre2_maketables() function, in +the relevant locale. The only argument to this function is a general context, +which can be used to pass a custom memory allocator. If the argument is NULL, +the system malloc() is used. The result can be passed to +pcre2_compile() as often as necessary, by creating a compile context and +calling pcre2_set_character_tables() to set the tables pointer therein. +
++For example, to build and use tables that are appropriate for the French locale +(where accented characters with values greater than 127 are treated as +letters), the following code could be used: +
+ setlocale(LC_CTYPE, "fr_FR"); + tables = pcre2_maketables(NULL); + ccontext = pcre2_compile_context_create(NULL); + pcre2_set_character_tables(ccontext, tables); + re = pcre2_compile(..., ccontext); ++The locale name "fr_FR" is used on Linux and other Unix-like systems; if you +are using Windows, the name for the French locale is "french". + +
+The pointer that is passed (via the compile context) to pcre2_compile() +is saved with the compiled pattern, and the same tables are used by the +matching functions. Thus, for any single pattern, compilation and matching both +happen in the same locale, but different patterns can be processed in different +locales. +
++It is the caller's responsibility to ensure that the memory containing the +tables remains available while they are still in use. When they are no longer +needed, you can discard them using pcre2_maketables_free(), which should +be passed as its first parameter the same general context that was used to create the +tables. +
++The tables described above are just a sequence of binary bytes, which makes +them independent of hardware characteristics such as endianness or whether the +processor is 32-bit or 64-bit. A copy of the result of pcre2_maketables() +can therefore be saved in a file or elsewhere and re-used later, even in a +different program or on another computer. The size of the tables (number of +bytes) must be obtained by calling pcre2_config() with the +PCRE2_CONFIG_TABLES_LENGTH option because pcre2_maketables() does not +return this value. Note that the pcre2_dftables program, which is part of +the PCRE2 build system, can be used stand-alone to create a file that contains +a set of binary tables. See the +pcre2build +documentation for details. +
++int pcre2_pattern_info(const pcre2_code *code, uint32_t what, void *where); +
++The pcre2_pattern_info() function returns general information about a +compiled pattern. For information about callouts, see the +next section. +The first argument for pcre2_pattern_info() is a pointer to the compiled +pattern. The second argument specifies which piece of information is required, +and the third argument is a pointer to a variable to receive the data. If the +third argument is NULL, the first argument is ignored, and the function returns +the size in bytes of the variable that is required for the information +requested. Otherwise, the yield of the function is zero for success, or one of +the following negative numbers: +
+ PCRE2_ERROR_NULL the argument code was NULL + PCRE2_ERROR_BADMAGIC the "magic number" was not found + PCRE2_ERROR_BADOPTION the value of what was invalid + PCRE2_ERROR_UNSET the requested field is not set ++The "magic number" is placed at the start of each compiled pattern as a simple +check against passing an arbitrary memory pointer. Here is a typical call of +pcre2_pattern_info(), to obtain the length of the compiled pattern: +
+ int rc; + size_t length; + rc = pcre2_pattern_info( + re, /* result of pcre2_compile() */ + PCRE2_INFO_SIZE, /* what is required */ + &length); /* where to put the data */ ++The possible values for the second argument are defined in pcre2.h, and +are as follows: +
+ PCRE2_INFO_ALLOPTIONS + PCRE2_INFO_ARGOPTIONS + PCRE2_INFO_EXTRAOPTIONS ++Return copies of the pattern's options. The third argument should point to a +uint32_t variable. PCRE2_INFO_ARGOPTIONS returns exactly the options that +were passed to pcre2_compile(), whereas PCRE2_INFO_ALLOPTIONS returns +the compile options as modified by any top-level (*XXX) option settings such as +(*UTF) at the start of the pattern itself. PCRE2_INFO_EXTRAOPTIONS returns the +extra options that were set in the compile context by calling the +pcre2_set_compile_extra_options() function. + +
+For example, if the pattern /(*UTF)abc/ is compiled with the PCRE2_EXTENDED +option, the result for PCRE2_INFO_ALLOPTIONS is PCRE2_EXTENDED and PCRE2_UTF. +Option settings such as (?i) that can change within a pattern do not affect the +result of PCRE2_INFO_ALLOPTIONS, even if they appear right at the start of the +pattern. (This was different in some earlier releases.) +
++A pattern compiled without PCRE2_ANCHORED is automatically anchored by PCRE2 if +the first significant item in every top-level branch is one of the following: +
+ ^ unless PCRE2_MULTILINE is set + \A always + \G always + .* sometimes - see below ++When .* is the first significant item, anchoring is possible only when all the +following are true: +
+ .* is not in an atomic group + .* is not in a capture group that is the subject of a backreference + PCRE2_DOTALL is in force for .* + Neither (*PRUNE) nor (*SKIP) appears in the pattern + PCRE2_NO_DOTSTAR_ANCHOR is not set ++For patterns that are auto-anchored, the PCRE2_ANCHORED bit is set in the +options returned for PCRE2_INFO_ALLOPTIONS. +
+ PCRE2_INFO_BACKREFMAX ++Return the number of the highest backreference in the pattern. The third +argument should point to a uint32_t variable. Named capture groups +acquire numbers as well as names, and these count towards the highest +backreference. Backreferences such as \4 or \g{12} match the captured +characters of the given group, but in addition, the check that a capture +group is set in a conditional group such as (?(3)a|b) is also a backreference. +Zero is returned if there are no backreferences. +
+ PCRE2_INFO_BSR ++The output is a uint32_t integer whose value indicates what character sequences +the \R escape sequence matches. A value of PCRE2_BSR_UNICODE means that \R +matches any Unicode line ending sequence; a value of PCRE2_BSR_ANYCRLF means +that \R matches only CR, LF, or CRLF. +
+ PCRE2_INFO_CAPTURECOUNT ++Return the highest capture group number in the pattern. In patterns where (?| +is not used, this is also the total number of capture groups. The third +argument should point to a uint32_t variable. +
+ PCRE2_INFO_DEPTHLIMIT ++If the pattern set a backtracking depth limit by including an item of the form +(*LIMIT_DEPTH=nnnn) at the start, the value is returned. The third argument +should point to a uint32_t integer. If no such value has been set, the call to +pcre2_pattern_info() returns the error PCRE2_ERROR_UNSET. Note that this +limit will only be used during matching if it is less than the limit set or +defaulted by the caller of the match function. +
+ PCRE2_INFO_FIRSTBITMAP ++In the absence of a single first code unit for a non-anchored pattern, +pcre2_compile() may construct a 256-bit table that defines a fixed set of +values for the first code unit in any match. For example, a pattern that starts +with [abc] results in a table with three bits set. When code unit values +greater than 255 are supported, the flag bit for 255 means "any code unit of +value 255 or above". If such a table was constructed, a pointer to it is +returned. Otherwise NULL is returned. The third argument should point to a +const uint8_t * variable. +
+ PCRE2_INFO_FIRSTCODETYPE ++Return information about the first code unit of any matched string, for a +non-anchored pattern. The third argument should point to a uint32_t +variable. If there is a fixed first value, for example, the letter "c" from a +pattern such as (cat|cow|coyote), 1 is returned, and the value can be retrieved +using PCRE2_INFO_FIRSTCODEUNIT. If there is no fixed first value, but it is +known that a match can occur only at the start of the subject or following a +newline in the subject, 2 is returned. Otherwise, and for anchored patterns, 0 +is returned. +
+ PCRE2_INFO_FIRSTCODEUNIT ++Return the value of the first code unit of any matched string for a pattern +where PCRE2_INFO_FIRSTCODETYPE returns 1; otherwise return 0. The third +argument should point to a uint32_t variable. In the 8-bit library, the +value is always less than 256. In the 16-bit library the value can be up to +0xffff. In the 32-bit library in UTF-32 mode the value can be up to 0x10ffff, +and up to 0xffffffff when not using UTF-32 mode. +
+ PCRE2_INFO_FRAMESIZE ++Return the size (in bytes) of the data frames that are used to remember +backtracking positions when the pattern is processed by pcre2_match() +without the use of JIT. The third argument should point to a size_t +variable. The frame size depends on the number of capturing parentheses in the +pattern. Each additional capture group adds two PCRE2_SIZE variables. +
+ PCRE2_INFO_HASBACKSLASHC ++Return 1 if the pattern contains any instances of \C, otherwise 0. The third +argument should point to a uint32_t variable. +
+ PCRE2_INFO_HASCRORLF ++Return 1 if the pattern contains any explicit matches for CR or LF characters, +otherwise 0. The third argument should point to a uint32_t variable. An +explicit match is either a literal CR or LF character, or \r or \n or one of +the equivalent hexadecimal or octal escape sequences. +
+ PCRE2_INFO_HEAPLIMIT ++If the pattern set a heap memory limit by including an item of the form +(*LIMIT_HEAP=nnnn) at the start, the value is returned. The third argument +should point to a uint32_t integer. If no such value has been set, the call to +pcre2_pattern_info() returns the error PCRE2_ERROR_UNSET. Note that this +limit will only be used during matching if it is less than the limit set or +defaulted by the caller of the match function. +
+ PCRE2_INFO_JCHANGED ++Return 1 if the (?J) or (?-J) option setting is used in the pattern, otherwise +0. The third argument should point to a uint32_t variable. (?J) and +(?-J) set and unset the local PCRE2_DUPNAMES option, respectively. +
+ PCRE2_INFO_JITSIZE ++If the compiled pattern was successfully processed by +pcre2_jit_compile(), return the size of the JIT compiled code, otherwise +return zero. The third argument should point to a size_t variable. +
+ PCRE2_INFO_LASTCODETYPE ++Return 1 if there is a rightmost literal code unit that must exist in any +matched string, other than at its start. The third argument should point to a +uint32_t variable. If there is no such value, 0 is returned. When 1 is +returned, the code unit value itself can be retrieved using +PCRE2_INFO_LASTCODEUNIT. For anchored patterns, a last literal value is +recorded only if it follows something of variable length. For example, for the +pattern /^a\d+z\d+/ the returned value is 1 (with "z" returned from +PCRE2_INFO_LASTCODEUNIT), but for /^a\dz\d/ the returned value is 0. +
+ PCRE2_INFO_LASTCODEUNIT ++Return the value of the rightmost literal code unit that must exist in any +matched string, other than at its start, for a pattern where +PCRE2_INFO_LASTCODETYPE returns 1. Otherwise, return 0. The third argument +should point to a uint32_t variable. +
+ PCRE2_INFO_MATCHEMPTY ++Return 1 if the pattern might match an empty string, otherwise 0. The third +argument should point to a uint32_t variable. When a pattern contains +recursive subroutine calls it is not always possible to determine whether or +not it can match an empty string. PCRE2 takes a cautious approach and returns 1 +in such cases. +
+ PCRE2_INFO_MATCHLIMIT ++If the pattern set a match limit by including an item of the form +(*LIMIT_MATCH=nnnn) at the start, the value is returned. The third argument +should point to a uint32_t integer. If no such value has been set, the call to +pcre2_pattern_info() returns the error PCRE2_ERROR_UNSET. Note that this +limit will only be used during matching if it is less than the limit set or +defaulted by the caller of the match function. +
+ PCRE2_INFO_MAXLOOKBEHIND ++A lookbehind assertion moves back a certain number of characters (not code +units) when it starts to process each of its branches. This request returns the +largest of these backward moves. The third argument should point to a uint32_t +integer. The simple assertions \b and \B require a one-character lookbehind +and cause PCRE2_INFO_MAXLOOKBEHIND to return 1 in the absence of anything +longer. \A also registers a one-character lookbehind, though it does not +actually inspect the previous character. + +
+Note that this information is useful for multi-segment matching only +if the pattern contains no nested lookbehinds. For example, the pattern +(?<=a(?<=ba)c) returns a maximum lookbehind of 2, but when it is processed, the +first lookbehind moves back by two characters, matches one character, then the +nested lookbehind also moves back by two characters. This puts the matching +point three characters earlier than it was at the start. +PCRE2_INFO_MAXLOOKBEHIND is really only useful as a debugging tool. See the +pcre2partial +documentation for a discussion of multi-segment matching. +
+ PCRE2_INFO_MINLENGTH ++If a minimum length for matching subject strings was computed, its value is +returned. Otherwise the returned value is 0. This value is not computed when +PCRE2_NO_START_OPTIMIZE is set. The value is a number of characters, which in +UTF mode may be different from the number of code units. The third argument +should point to a uint32_t variable. The value is a lower bound to the +length of any matching string. There may not be any strings of that length that +do actually match, but every string that does match is at least that long. +
+ PCRE2_INFO_NAMECOUNT + PCRE2_INFO_NAMEENTRYSIZE + PCRE2_INFO_NAMETABLE ++PCRE2 supports the use of named as well as numbered capturing parentheses. The +names are just an additional way of identifying the parentheses, which still +acquire numbers. Several convenience functions such as +pcre2_substring_get_byname() are provided for extracting captured +substrings by name. It is also possible to extract the data directly, by first +converting the name to a number in order to access the correct pointers in the +output vector (described with pcre2_match() below). To do the conversion, +you need to use the name-to-number map, which is described by these three +values. + +
+The map consists of a number of fixed-size entries. PCRE2_INFO_NAMECOUNT gives +the number of entries, and PCRE2_INFO_NAMEENTRYSIZE gives the size of each +entry in code units; both of these return a uint32_t value. The entry +size depends on the length of the longest name. +
++PCRE2_INFO_NAMETABLE returns a pointer to the first entry of the table. This is +a PCRE2_SPTR pointer to a block of code units. In the 8-bit library, the first +two bytes of each entry are the number of the capturing parenthesis, most +significant byte first. In the 16-bit library, the pointer points to 16-bit +code units, the first of which contains the parenthesis number. In the 32-bit +library, the pointer points to 32-bit code units, the first of which contains +the parenthesis number. The rest of the entry is the corresponding name, zero +terminated. +
++The names are in alphabetical order. If (?| is used to create multiple capture +groups with the same number, as described in the +section on duplicate group numbers +in the +pcre2pattern +page, the groups may be given the same name, but there is only one entry in the +table. Different names for groups of the same number are not permitted. +
++Duplicate names for capture groups with different numbers are permitted, but +only if PCRE2_DUPNAMES is set. They appear in the table in the order in which +they were found in the pattern. In the absence of (?| this is the order of +increasing number; when (?| is used this is not necessarily the case because +later capture groups may have lower numbers. +
++As a simple example of the name/number table, consider the following pattern +after compilation by the 8-bit library (assume PCRE2_EXTENDED is set, so white +space - including newlines - is ignored): +
+ (?<date> (?<year>(\d\d)?\d\d) - (?<month>\d\d) - (?<day>\d\d) ) ++There are four named capture groups, so the table has four entries, and each +entry in the table is eight bytes long. The table is as follows, with +non-printing bytes shown in hexadecimal, and undefined bytes shown as ??: +
+ 00 01 d a t e 00 ?? + 00 05 d a y 00 ?? ?? + 00 04 m o n t h 00 + 00 02 y e a r 00 ?? ++When writing code to extract data from named capture groups using the +name-to-number map, remember that the length of the entries is likely to be +different for each compiled pattern. +
+ PCRE2_INFO_NEWLINE ++The output is one of the following uint32_t values: +
+ PCRE2_NEWLINE_CR Carriage return (CR) + PCRE2_NEWLINE_LF Linefeed (LF) + PCRE2_NEWLINE_CRLF Carriage return, linefeed (CRLF) + PCRE2_NEWLINE_ANY Any Unicode line ending + PCRE2_NEWLINE_ANYCRLF Any of CR, LF, or CRLF + PCRE2_NEWLINE_NUL The NUL character (binary zero) ++This identifies the character sequence that will be recognized as meaning +"newline" while matching. +
+ PCRE2_INFO_SIZE ++Return the size of the compiled pattern in bytes (for all three libraries). The +third argument should point to a size_t variable. This value includes the +size of the general data block that precedes the code units of the compiled +pattern itself. The value that is used when pcre2_compile() is getting +memory in which to place the compiled pattern may be slightly larger than the +value returned by this option, because there are cases where the code that +calculates the size has to over-estimate. Processing a pattern with the JIT +compiler does not alter the value returned by this option. + +
+int pcre2_callout_enumerate(const pcre2_code *code,
+ int (*callback)(pcre2_callout_enumerate_block *, void *),
+ void *user_data);
+
+
+A script language that supports the use of string arguments in callouts might
+like to scan all the callouts in a pattern before running the match. This can
+be done by calling pcre2_callout_enumerate(). The first argument is a
+pointer to a compiled pattern, the second points to a callback function, and
+the third is arbitrary user data. The callback function is called for every
+callout in the pattern in the order in which they appear. Its first argument is
+a pointer to a callout enumeration block, and its second argument is the
+user_data value that was passed to pcre2_callout_enumerate(). The
+contents of the callout enumeration block are described in the
+pcre2callout
+documentation, which also gives further details about callouts.
+
+It is possible to save compiled patterns on disc or elsewhere, and reload them +later, subject to a number of restrictions. The host on which the patterns are +reloaded must be running the same version of PCRE2, with the same code unit +width, and must also have the same endianness, pointer width, and PCRE2_SIZE +type. Before compiled patterns can be saved, they must be converted to a +"serialized" form, which in the case of PCRE2 is really just a bytecode dump. +The functions whose names begin with pcre2_serialize_ are used for +converting to and from the serialized form. They are described in the +pcre2serialize +documentation. Note that PCRE2 serialization does not convert compiled patterns +to an abstract format like Java or .NET serialization. +
+
+pcre2_match_data *pcre2_match_data_create(uint32_t ovecsize,
+ pcre2_general_context *gcontext);
+
+
+pcre2_match_data *pcre2_match_data_create_from_pattern(
+ const pcre2_code *code, pcre2_general_context *gcontext);
+
+
+void pcre2_match_data_free(pcre2_match_data *match_data);
+
+Information about a successful or unsuccessful match is placed in a match +data block, which is an opaque structure that is accessed by function calls. In +particular, the match data block contains a vector of offsets into the subject +string that define the matched parts of the subject. This is known as the +ovector. +
++Before calling pcre2_match(), pcre2_dfa_match(), or +pcre2_jit_match() you must create a match data block by calling one of +the creation functions above. For pcre2_match_data_create(), the first +argument is the number of pairs of offsets in the ovector. +
++When using pcre2_match(), one pair of offsets is required to identify the +string that matched the whole pattern, with an additional pair for each +captured substring. For example, a value of 4 creates enough space to record +the matched portion of the subject plus three captured substrings. +
++When using pcre2_dfa_match() there may be multiple matched substrings of +different lengths at the same point in the subject. The ovector should be made +large enough to hold as many as are expected. +
++A minimum of 1 pair is imposed by pcre2_match_data_create(), so +it is always possible to return the overall matched string in the case of +pcre2_match() or the longest match in the case of +pcre2_dfa_match(). The maximum number of pairs is 65535; if the first +argument of pcre2_match_data_create() is greater than this, 65535 is +used. +
++The second argument of pcre2_match_data_create() is a pointer to a +general context, which can specify custom memory management for obtaining the +memory for the match data block. If you are not using custom memory management, +pass NULL, which causes malloc() to be used. +
++For pcre2_match_data_create_from_pattern(), the first argument is a +pointer to a compiled pattern. The ovector is created to be exactly the right +size to hold all the substrings a pattern might capture when matched using +pcre2_match(). You should not use this call when matching with +pcre2_dfa_match(). The second argument is again a pointer to a general +context, but in this case if NULL is passed, the memory is obtained using the +same allocator that was used for the compiled pattern (custom or default). +
++A match data block can be used many times, with the same or different compiled +patterns. You can extract information from a match data block after a match +operation has finished, using functions that are described in the sections on +matched strings +and +other match data +below. +
++When a call of pcre2_match() fails, valid data is available in the match +block only when the error is PCRE2_ERROR_NOMATCH, PCRE2_ERROR_PARTIAL, or one +of the error codes for an invalid UTF string. Exactly what is available depends +on the error, and is detailed below. +
++When one of the matching functions is called, pointers to the compiled pattern +and the subject string are set in the match data block so that they can be +referenced by the extraction functions after a successful match. After running +a match, you must not free a compiled pattern or a subject string until after +all operations on the match data block (for that match) have taken place, +unless, in the case of the subject string, you have used the +PCRE2_COPY_MATCHED_SUBJECT option, which is described in the section entitled +"Option bits for pcre2_match()" +below. +
++When a match data block itself is no longer needed, it should be freed by +calling pcre2_match_data_free(). If this function is called with a NULL +argument, it returns immediately, without doing anything. +
+
+PCRE2_SIZE pcre2_get_match_data_size(pcre2_match_data *match_data);
+
+
+PCRE2_SIZE pcre2_get_match_data_heapframes_size(
+ pcre2_match_data *match_data);
+
+The size of a match data block depends on the size of the ovector that it +contains. The function pcre2_get_match_data_size() returns the size, in +bytes, of the block that is its argument. +
++When pcre2_match() runs interpretively (that is, without using JIT), it +makes use of a vector of data frames for remembering backtracking positions. +The size of each individual frame depends on the number of capturing +parentheses in the pattern and can be obtained by calling +pcre2_pattern_info() with the PCRE2_INFO_FRAMESIZE option (see the +section entitled "Information about a compiled pattern" +above). +
++Heap memory is used for the frames vector; if the initial memory block turns +out to be too small during matching, it is automatically expanded. When +pcre2_match() returns, the memory is not freed, but remains attached to +the match data block, for use by any subsequent matches that use the same +block. It is automatically freed when the match data block itself is freed. +
++You can find the current size of the frames vector that a match data block owns +by calling pcre2_get_match_data_heapframes_size(). For a newly created +match data block the size will be zero. Some types of match may require a lot +of frames and thus a large vector; applications that run in environments where +memory is constrained can check this and free the match data block if the heap +frames vector has become too big. +
++int pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, + PCRE2_SIZE length, PCRE2_SIZE startoffset, + uint32_t options, pcre2_match_data *match_data, + pcre2_match_context *mcontext); +
++The function pcre2_match() is called to match a subject string against a +compiled pattern, which is passed in the code argument. You can call +pcre2_match() with the same code argument as many times as you +like, in order to find multiple matches in the subject string or to match +different subject strings with the same pattern. +
++This function is the main matching facility of the library, and it operates in +a Perl-like manner. For specialist use there is also an alternative matching +function, which is described +below +in the section about the pcre2_dfa_match() function. +
++Here is an example of a simple call to pcre2_match(): +
+ pcre2_match_data *md = pcre2_match_data_create(4, NULL); + int rc = pcre2_match( + re, /* result of pcre2_compile() */ + "some string", /* the subject string */ + 11, /* the length of the subject string */ + 0, /* start at offset 0 in the subject */ + 0, /* default options */ + md, /* the match data block */ + NULL); /* a match context; NULL means use defaults */ ++If the subject string is zero-terminated, the length can be given as +PCRE2_ZERO_TERMINATED. A match context must be provided if certain less common +matching parameters are to be changed. For details, see the section on +the match context +above. + +
+The subject string is passed to pcre2_match() as a pointer in +subject, a length in length, and a starting offset in +startoffset. The length and offset are in code units, not characters. +That is, they are in bytes for the 8-bit library, 16-bit code units for the +16-bit library, and 32-bit code units for the 32-bit library, whether or not +UTF processing is enabled. As a special case, if subject is NULL and +length is zero, the subject is assumed to be an empty string. If +length is non-zero, an error occurs if subject is NULL. +
++If startoffset is greater than the length of the subject, +pcre2_match() returns PCRE2_ERROR_BADOFFSET. When the starting offset is +zero, the search for a match starts at the beginning of the subject, and this +is by far the most common case. In UTF-8 or UTF-16 mode, the starting offset +must point to the start of a character, or to the end of the subject (in UTF-32 +mode, one code unit equals one character, so all offsets are valid). Like the +pattern string, the subject may contain binary zeros. +
++A non-zero starting offset is useful when searching for another match in the +same subject by calling pcre2_match() again after a previous success. +Setting startoffset differs from passing over a shortened string and +setting PCRE2_NOTBOL in the case of a pattern that begins with any kind of +lookbehind. For example, consider the pattern +
+ \Biss\B ++which finds occurrences of "iss" in the middle of words. (\B matches only if +the current position in the subject is not a word boundary.) When applied to +the string "Mississippi" the first call to pcre2_match() finds the first +occurrence. If pcre2_match() is called again with just the remainder of +the subject, namely "issippi", it does not match, because \B is always false +at the start of the subject, which is deemed to be a word boundary. However, if +pcre2_match() is passed the entire string again, but with +startoffset set to 4, it finds the second occurrence of "iss" because it +is able to look behind the starting point to discover that it is preceded by a +letter. + +
+Finding all the matches in a subject is tricky when the pattern can match an +empty string. It is possible to emulate Perl's /g behaviour by first trying the +match again at the same offset, with the PCRE2_NOTEMPTY_ATSTART and +PCRE2_ANCHORED options, and then if that fails, advancing the starting offset +and trying an ordinary match again. There is some code that demonstrates how to +do this in the +pcre2demo +sample program. In the most general case, you have to check to see if the +newline convention recognizes CRLF as a newline, and if so, and the current +character is CR followed by LF, advance the starting offset by two characters +instead of one. +
++If a non-zero starting offset is passed when the pattern is anchored, a single +attempt to match at the given offset is made. This can only succeed if the +pattern does not require the match to be at the start of the subject. In other +words, the anchoring must be the result of setting the PCRE2_ANCHORED option or +the use of .* with PCRE2_DOTALL, not by starting the pattern with ^ or \A. +
++The unused bits of the options argument for pcre2_match() must be +zero. The only bits that may be set are PCRE2_ANCHORED, +PCRE2_COPY_MATCHED_SUBJECT, PCRE2_DISABLE_RECURSELOOP_CHECK, PCRE2_ENDANCHORED, +PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, +PCRE2_NO_JIT, PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. +Their action is described below. +
++Setting PCRE2_ANCHORED or PCRE2_ENDANCHORED at match time is not supported by +the just-in-time (JIT) compiler. If it is set, JIT matching is disabled and the +interpretive code in pcre2_match() is run. +PCRE2_DISABLE_RECURSELOOP_CHECK is ignored by JIT, but apart from PCRE2_NO_JIT +(obviously), the remaining options are supported for JIT matching. +
+ PCRE2_ANCHORED ++The PCRE2_ANCHORED option limits pcre2_match() to matching at the first +matching position. If a pattern was compiled with PCRE2_ANCHORED, or turned out +to be anchored by virtue of its contents, it cannot be made unanchored at +matching time. Note that setting the option at match time disables JIT +matching. +
+ PCRE2_COPY_MATCHED_SUBJECT ++By default, a pointer to the subject is remembered in the match data block so +that, after a successful match, it can be referenced by the substring +extraction functions. This means that the subject's memory must not be freed +until all such operations are complete. For some applications where the +lifetime of the subject string is not guaranteed, it may be necessary to make a +copy of the subject string, but it is wasteful to do this unless the match is +successful. After a successful match, if PCRE2_COPY_MATCHED_SUBJECT is set, the +subject is copied and the new pointer is remembered in the match data block +instead of the original subject pointer. The memory allocator that was used for +the match block itself is used. The copy is automatically freed when +pcre2_match_data_free() is called to free the match data block. It is also +automatically freed if the match data block is re-used for another match +operation. +
+ PCRE2_DISABLE_RECURSELOOP_CHECK ++This option is relevant only to pcre2_match() for interpretive matching. +It is ignored when JIT is used, and is forbidden for pcre2_dfa_match(). + +
+The use of recursion in patterns can lead to infinite loops. In the +interpretive matcher these would be eventually caught by the match or heap +limits, but this could take a long time and/or use a lot of memory if the +limits are large. There is therefore a check at the start of each recursion. +If the same group is still active from a previous call, and the current subject +pointer is the same as it was at the start of that group, and the furthest +inspected character of the subject has not changed, an error is generated. +
++There are rare cases of matches that would complete, but nevertheless trigger +this error. This option disables the check. It is provided mainly for testing +when comparing JIT and interpretive behaviour. +
+ PCRE2_ENDANCHORED ++If the PCRE2_ENDANCHORED option is set, any string that pcre2_match() +matches must be right at the end of the subject string. Note that setting the +option at match time disables JIT matching. +
+ PCRE2_NOTBOL ++This option specifies that the first character of the subject string is not the +beginning of a line, so the circumflex metacharacter should not match before +it. Setting this without having set PCRE2_MULTILINE at compile time causes +circumflex never to match. This option affects only the behaviour of the +circumflex metacharacter. It does not affect \A. +
+ PCRE2_NOTEOL ++This option specifies that the end of the subject string is not the end of a +line, so the dollar metacharacter should not match it nor (except in multiline +mode) a newline immediately before it. Setting this without having set +PCRE2_MULTILINE at compile time causes dollar never to match. This option +affects only the behaviour of the dollar metacharacter. It does not affect \Z +or \z. +
+ PCRE2_NOTEMPTY ++An empty string is not considered to be a valid match if this option is set. If +there are alternatives in the pattern, they are tried. If all the alternatives +match the empty string, the entire match fails. For example, if the pattern +
+ a?b? ++is applied to a string not beginning with "a" or "b", it matches an empty +string at the start of the subject. With PCRE2_NOTEMPTY set, this match is not +valid, so pcre2_match() searches further into the string for occurrences +of "a" or "b". +
+ PCRE2_NOTEMPTY_ATSTART ++This is like PCRE2_NOTEMPTY, except that it locks out an empty string match +only at the first matching position, that is, at the start of the subject plus +the starting offset. An empty string match later in the subject is permitted. +If the pattern is anchored, such a match can occur only if the pattern contains +\K. +
+ PCRE2_NO_JIT ++By default, if a pattern has been successfully processed by +pcre2_jit_compile(), JIT is automatically used when pcre2_match() +is called with options that JIT supports. Setting PCRE2_NO_JIT disables the use +of JIT; it forces matching to be done by the interpreter. +
+ PCRE2_NO_UTF_CHECK ++When PCRE2_UTF is set at compile time, the validity of the subject as a UTF +string is checked unless PCRE2_NO_UTF_CHECK is passed to pcre2_match() or +PCRE2_MATCH_INVALID_UTF was passed to pcre2_compile(). The latter special +case is discussed in detail in the +pcre2unicode +documentation. + +
+In the default case, if a non-zero starting offset is given, the check is +applied only to that part of the subject that could be inspected during +matching, and there is a check that the starting offset points to the first +code unit of a character or to the end of the subject. If there are no +lookbehind assertions in the pattern, the check starts at the starting offset. +Otherwise, it starts at the length of the longest lookbehind before the +starting offset, or at the start of the subject if there are not that many +characters before the starting offset. Note that the sequences \b and \B are +one-character lookbehinds. +
++The check is carried out before any other processing takes place, and a +negative error code is returned if the check fails. There are several UTF error +codes for each code unit width, corresponding to different problems with the +code unit sequence. There are discussions about the validity of +UTF-8 strings, +UTF-16 strings, +and +UTF-32 strings +in the +pcre2unicode +documentation. +
++If you know that your subject is valid, and you want to skip this check for +performance reasons, you can set the PCRE2_NO_UTF_CHECK option when calling +pcre2_match(). You might want to do this for the second and subsequent +calls to pcre2_match() if you are making repeated calls to find multiple +matches in the same subject string. +
++Warning: Unless PCRE2_MATCH_INVALID_UTF was set at compile time, when +PCRE2_NO_UTF_CHECK is set at match time the effect of passing an invalid +string as a subject, or an invalid value of startoffset, is undefined. +Your program may crash or loop indefinitely or give wrong results. +
+ PCRE2_PARTIAL_HARD + PCRE2_PARTIAL_SOFT ++These options turn on the partial matching feature. A partial match occurs if +the end of the subject string is reached successfully, but there are not enough +subject characters to complete the match. In addition, either at least one +character must have been inspected or the pattern must contain a lookbehind, or +the pattern must be one that could match an empty string. + +
+If this situation arises when PCRE2_PARTIAL_SOFT (but not PCRE2_PARTIAL_HARD) +is set, matching continues by testing any remaining alternatives. Only if no +complete match can be found is PCRE2_ERROR_PARTIAL returned instead of +PCRE2_ERROR_NOMATCH. In other words, PCRE2_PARTIAL_SOFT specifies that the +caller is prepared to handle a partial match, but only if no complete match can +be found. +
++If PCRE2_PARTIAL_HARD is set, it overrides PCRE2_PARTIAL_SOFT. In this case, if +a partial match is found, pcre2_match() immediately returns +PCRE2_ERROR_PARTIAL, without considering any other alternatives. In other +words, when PCRE2_PARTIAL_HARD is set, a partial match is considered to be more +important than an alternative complete match. +
++There is a more detailed discussion of partial and multi-segment matching, with +examples, in the +pcre2partial +documentation. +
++When PCRE2 is built, a default newline convention is set; this is usually the +standard convention for the operating system. The default can be overridden in +a +compile context +by calling pcre2_set_newline(). It can also be overridden by starting a +pattern string with, for example, (*CRLF), as described in the +section on newline conventions +in the +pcre2pattern +page. During matching, the newline choice affects the behaviour of the dot, +circumflex, and dollar metacharacters. It may also alter the way the match +starting position is advanced after a match failure for an unanchored pattern. +
++When PCRE2_NEWLINE_CRLF, PCRE2_NEWLINE_ANYCRLF, or PCRE2_NEWLINE_ANY is set as +the newline convention, and a match attempt for an unanchored pattern fails +when the current starting position is at a CRLF sequence, and the pattern +contains no explicit matches for CR or LF characters, the match position is +advanced by two characters instead of one, in other words, to after the CRLF. +
++The above rule is a compromise that makes the most common cases work as +expected. For example, if the pattern is .+A (and the PCRE2_DOTALL option is +not set), it does not match the string "\r\nA" because, after failing at the +start, it skips both the CR and the LF before retrying. However, the pattern +[\r\n]A does match that string, because it contains an explicit CR or LF +reference, and so advances only by one character after the first failure. +
++An explicit match for CR or LF is either a literal appearance of one of those +characters in the pattern, or one of the \r or \n or equivalent octal or +hexadecimal escape sequences. Implicit matches such as [^X] do not count, nor +does \s, even though it includes CR and LF in the characters that it matches. +
++Notwithstanding the above, anomalous effects may still occur when CRLF is a +valid newline sequence and explicit \r or \n escapes appear in the pattern. +
+
+uint32_t pcre2_get_ovector_count(pcre2_match_data *match_data);
+
+
+PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *match_data);
+
+In general, a pattern matches a certain portion of the subject, and in +addition, further substrings from the subject may be picked out by +parenthesized parts of the pattern. Following the usage in Jeffrey Friedl's +book, this is called "capturing" in what follows, and the phrase "capture +group" (Perl terminology) is used for a fragment of a pattern that picks out a +substring. PCRE2 supports several other kinds of parenthesized group that do +not cause substrings to be captured. The pcre2_pattern_info() function +can be used to find out how many capture groups there are in a compiled +pattern. +
++You can use auxiliary functions for accessing captured substrings +by number +or +by name, +as described in sections below. +
++Alternatively, you can make direct use of the vector of PCRE2_SIZE values, +called the ovector, which contains the offsets of captured strings. It is +part of the +match data block. +The function pcre2_get_ovector_pointer() returns the address of the +ovector, and pcre2_get_ovector_count() returns the number of pairs of +values it contains. +
++Within the ovector, the first in each pair of values is set to the offset of +the first code unit of a substring, and the second is set to the offset of the +first code unit after the end of a substring. These values are always code unit +offsets, not character offsets. That is, they are byte offsets in the 8-bit +library, 16-bit offsets in the 16-bit library, and 32-bit offsets in the 32-bit +library. +
++After a partial match (error return PCRE2_ERROR_PARTIAL), only the first pair +of offsets (that is, ovector[0] and ovector[1]) are set. They +identify the part of the subject that was partially matched. See the +pcre2partial +documentation for details of partial matching. +
++After a fully successful match, the first pair of offsets identifies the +portion of the subject string that was matched by the entire pattern. The next +pair is used for the first captured substring, and so on. The value returned by +pcre2_match() is one more than the highest numbered pair that has been +set. For example, if two substrings have been captured, the returned value is +3. If there are no captured substrings, the return value from a successful +match is 1, indicating that just the first pair of offsets has been set. +
++If a pattern uses the \K escape sequence within a positive assertion, the +reported start of a successful match can be greater than the end of the match. +For example, if the pattern (?=ab\K) is matched against "ab", the start and +end offset values for the match are 2 and 0. +
++If a capture group is matched repeatedly within a single match operation, it is +the last portion of the subject that it matched that is returned. +
++If the ovector is too small to hold all the captured substring offsets, as much +as possible is filled in, and the function returns a value of zero. If captured +substrings are not of interest, pcre2_match() may be called with a match +data block whose ovector is of minimum length (that is, one pair). +
++It is possible for capture group number n+1 to match some part of the +subject when group n has not been used at all. For example, if the string +"abc" is matched against the pattern (a|(z))(bc) the return from the function +is 4, and groups 1 and 3 are matched, but 2 is not. When this happens, both +values in the offset pairs corresponding to unused groups are set to +PCRE2_UNSET. +
++Offset values that correspond to unused groups at the end of the expression are +also set to PCRE2_UNSET. For example, if the string "abc" is matched against +the pattern (abc)(x(yz)?)? groups 2 and 3 are not matched. The return from the +function is 2, because the highest used capture group number is 1. The offsets +for the second and third capture groups (assuming the vector is large enough, +of course) are set to PCRE2_UNSET. +
++Elements in the ovector that do not correspond to capturing parentheses in the +pattern are never changed. That is, if a pattern contains n capturing +parentheses, no more than ovector[0] to ovector[2n+1] are set by +pcre2_match(). The other elements retain whatever values they previously +had. After a failed match attempt, the contents of the ovector are unchanged. +
+
+PCRE2_SPTR pcre2_get_mark(pcre2_match_data *match_data);
+
+
+PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *match_data);
+
+As well as the offsets in the ovector, other information about a match is +retained in the match data block and can be retrieved by the above functions in +appropriate circumstances. If they are called at other times, the result is +undefined. +
++After a successful match, a partial match (PCRE2_ERROR_PARTIAL), or a failure +to match (PCRE2_ERROR_NOMATCH), a mark name may be available. The function +pcre2_get_mark() can be called to access this name, which can be +specified in the pattern by any of the backtracking control verbs, not just +(*MARK). The same function applies to all the verbs. It returns a pointer to +the zero-terminated name, which is within the compiled pattern. If no name is +available, NULL is returned. The length of the name (excluding the terminating +zero) is stored in the code unit that precedes the name. You should use this +length instead of relying on the terminating zero if the name might contain a +binary zero. +
++After a successful match, the name that is returned is the last mark name +encountered on the matching path through the pattern. Instances of backtracking +verbs without names do not count. Thus, for example, if the matching path +contains (*MARK:A)(*PRUNE), the name "A" is returned. After a "no match" or a +partial match, the last encountered name is returned. For example, consider +this pattern: +
+ ^(*MARK:A)((*MARK:B)a|b)c ++When it matches "bc", the returned name is A. The B mark is "seen" in the first +branch of the group, but it is not on the matching path. On the other hand, +when this pattern fails to match "bx", the returned name is B. + +
+Warning: By default, certain start-of-match optimizations are used to +give a fast "no match" result in some situations. For example, if the anchoring +is removed from the pattern above, there is an initial check for the presence +of "c" in the subject before running the matching engine. This check fails for +"bx", causing a match failure without seeing any marks. You can disable the +start-of-match optimizations by setting the PCRE2_NO_START_OPTIMIZE option for +pcre2_compile() or by starting the pattern with (*NO_START_OPT). +
++After a successful match, a partial match, or one of the invalid UTF errors +(for example, PCRE2_ERROR_UTF8_ERR5), pcre2_get_startchar() can be +called. After a successful or partial match it returns the code unit offset of +the character at which the match started. For a non-partial match, this can be +different to the value of ovector[0] if the pattern contains the \K +escape sequence. After a partial match, however, this value is always the same +as ovector[0] because \K does not affect the result of a partial match. +
++After a UTF check failure, pcre2_get_startchar() can be used to obtain +the code unit offset of the invalid UTF character. Details are given in the +pcre2unicode +page. +
++If pcre2_match() fails, it returns a negative number. This can be +converted to a text string by calling the pcre2_get_error_message() +function (see "Obtaining a textual error message" +below). +Negative error codes are also returned by other functions, and are documented +with them. The codes are given names in the header file. If UTF checking is in +force and an invalid UTF subject string is detected, one of a number of +UTF-specific negative error codes is returned. Details are given in the +pcre2unicode +page. The following are the other errors that may be returned by +pcre2_match(): +
+ PCRE2_ERROR_NOMATCH ++The subject string did not match the pattern. +
+ PCRE2_ERROR_PARTIAL ++The subject string did not match, but it did match partially. See the +pcre2partial +documentation for details of partial matching. +
+ PCRE2_ERROR_BADMAGIC ++PCRE2 stores a 4-byte "magic number" at the start of the compiled code, to +catch the case when it is passed a junk pointer. This is the error that is +returned when the magic number is not present. +
+ PCRE2_ERROR_BADMODE ++This error is given when a compiled pattern is passed to a function in a +library of a different code unit width, for example, a pattern compiled by +the 8-bit library is passed to a 16-bit or 32-bit library function. +
+ PCRE2_ERROR_BADOFFSET ++The value of startoffset was greater than the length of the subject. +
+ PCRE2_ERROR_BADOPTION ++An unrecognized bit was set in the options argument. +
+ PCRE2_ERROR_BADUTFOFFSET ++The UTF code unit sequence that was passed as a subject was checked and found +to be valid (the PCRE2_NO_UTF_CHECK option was not set), but the value of +startoffset did not point to the beginning of a UTF character or the end +of the subject. +
+ PCRE2_ERROR_CALLOUT ++This error is never generated by pcre2_match() itself. It is provided for +use by callout functions that want to cause pcre2_match() or +pcre2_callout_enumerate() to return a distinctive error code. See the +pcre2callout +documentation for details. +
+ PCRE2_ERROR_DEPTHLIMIT ++The nested backtracking depth limit was reached. +
+ PCRE2_ERROR_HEAPLIMIT ++The heap limit was reached. +
+ PCRE2_ERROR_INTERNAL ++An unexpected internal error has occurred. This error could be caused by a bug +in PCRE2 or by overwriting of the compiled pattern. +
+ PCRE2_ERROR_JIT_STACKLIMIT ++This error is returned when a pattern that was successfully studied using JIT +is being matched, but the memory available for the just-in-time processing +stack is not large enough. See the +pcre2jit +documentation for more details. +
+ PCRE2_ERROR_MATCHLIMIT ++The backtracking match limit was reached. +
+ PCRE2_ERROR_NOMEMORY ++Heap memory is used to remember backtracking points. This error is given when +the memory allocation function (default or custom) fails. Note that a different +error, PCRE2_ERROR_HEAPLIMIT, is given if the amount of memory needed exceeds +the heap limit. PCRE2_ERROR_NOMEMORY is also returned if +PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails. +
+ PCRE2_ERROR_NULL ++Either the code, subject, or match_data argument was passed +as NULL. +
+ PCRE2_ERROR_RECURSELOOP ++This error is returned when pcre2_match() detects a recursion loop within +the pattern. Specifically, it means that either the whole pattern or a +capture group has been called recursively for the second time at the same +position in the subject string. Some simple patterns that might do this are +detected and faulted at compile time, but more complicated cases, in particular +mutual recursions between two different groups, cannot be detected until +matching is attempted. + +
+int pcre2_get_error_message(int errorcode, PCRE2_UCHAR *buffer, + PCRE2_SIZE bufflen); +
++A text message for an error code from any PCRE2 function (compile, match, or +auxiliary) can be obtained by calling pcre2_get_error_message(). The code +is passed as the first argument, with the remaining two arguments specifying a +code unit buffer and its length in code units, into which the text message is +placed. The message is returned in code units of the appropriate width for the +library that is being used. +
++The returned message is terminated with a trailing zero, and the function +returns the number of code units used, excluding the trailing zero. If the +error number is unknown, the negative error code PCRE2_ERROR_BADDATA is +returned. If the buffer is too small, the message is truncated (but still with +a trailing zero), and the negative error code PCRE2_ERROR_NOMEMORY is returned. +None of the messages are very long; a buffer size of 120 code units is ample. +
+
+int pcre2_substring_length_bynumber(pcre2_match_data *match_data,
+ uint32_t number, PCRE2_SIZE *length);
+
+
+int pcre2_substring_copy_bynumber(pcre2_match_data *match_data,
+ uint32_t number, PCRE2_UCHAR *buffer,
+ PCRE2_SIZE *bufflen);
+
+
+int pcre2_substring_get_bynumber(pcre2_match_data *match_data,
+ uint32_t number, PCRE2_UCHAR **bufferptr,
+ PCRE2_SIZE *bufflen);
+
+
+void pcre2_substring_free(PCRE2_UCHAR *buffer);
+
+Captured substrings can be accessed directly by using the ovector as described +above. +For convenience, auxiliary functions are provided for extracting captured +substrings as new, separate, zero-terminated strings. A substring that contains +a binary zero is correctly extracted and has a further zero added on the end, +but the result is not, of course, a C string. +
++The functions in this section identify substrings by number. The number zero +refers to the entire matched substring, with higher numbers referring to +substrings captured by parenthesized groups. After a partial match, only +substring zero is available. An attempt to extract any other substring gives +the error PCRE2_ERROR_PARTIAL. The next section describes similar functions for +extracting captured substrings by name. +
++If a pattern uses the \K escape sequence within a positive assertion, the +reported start of a successful match can be greater than the end of the match. +For example, if the pattern (?=ab\K) is matched against "ab", the start and +end offset values for the match are 2 and 0. In this situation, calling these +functions with a zero substring number extracts a zero-length empty string. +
++You can find the length in code units of a captured substring without +extracting it by calling pcre2_substring_length_bynumber(). The first +argument is a pointer to the match data block, the second is the group number, +and the third is a pointer to a variable into which the length is placed. If +you just want to know whether or not the substring has been captured, you can +pass the third argument as NULL. +
++The pcre2_substring_copy_bynumber() function copies a captured substring +into a supplied buffer, whereas pcre2_substring_get_bynumber() copies it +into new memory, obtained using the same memory allocation function that was +used for the match data block. The first two arguments of these functions are a +pointer to the match data block and a capture group number. +
++The final arguments of pcre2_substring_copy_bynumber() are a pointer to +the buffer and a pointer to a variable that contains its length in code units. +This is updated to contain the actual number of code units used for the +extracted substring, excluding the terminating zero. +
++For pcre2_substring_get_bynumber() the third and fourth arguments point +to variables that are updated with a pointer to the new memory and the number +of code units that comprise the substring, again excluding the terminating +zero. When the substring is no longer needed, the memory should be freed by +calling pcre2_substring_free(). +
++The return value from all these functions is zero for success, or a negative +error code. If the pattern match failed, the match failure code is returned. +If a substring number greater than zero is used after a partial match, +PCRE2_ERROR_PARTIAL is returned. Other possible error codes are: +
+ PCRE2_ERROR_NOMEMORY ++The buffer was too small for pcre2_substring_copy_bynumber(), or the +attempt to get memory failed for pcre2_substring_get_bynumber(). +
+ PCRE2_ERROR_NOSUBSTRING ++There is no substring with that number in the pattern, that is, the number is +greater than the number of capturing parentheses. +
+ PCRE2_ERROR_UNAVAILABLE ++The substring number, though not greater than the number of captures in the +pattern, is greater than the number of slots in the ovector, so the substring +could not be captured. +
+ PCRE2_ERROR_UNSET ++The substring did not participate in the match. For example, if the pattern is +(abc)|(def) and the subject is "def", and the ovector contains at least two +capturing slots, substring number 1 is unset. + +
+int pcre2_substring_list_get(pcre2_match_data *match_data,
+ PCRE2_UCHAR ***listptr, PCRE2_SIZE **lengthsptr);
+
+
+void pcre2_substring_list_free(PCRE2_UCHAR **list);
+
+The pcre2_substring_list_get() function extracts all available substrings +and builds a list of pointers to them. It also (optionally) builds a second +list that contains their lengths (in code units), excluding a terminating zero +that is added to each of them. All this is done in a single block of memory +that is obtained using the same memory allocation function that was used to get +the match data block. +
++This function must be called only after a successful match. If called after a +partial match, the error code PCRE2_ERROR_PARTIAL is returned. +
++The address of the memory block is returned via listptr, which is also +the start of the list of string pointers. The end of the list is marked by a +NULL pointer. The address of the list of lengths is returned via +lengthsptr. If your strings do not contain binary zeros and you do not +therefore need the lengths, you may supply NULL as the lengthsptr +argument to disable the creation of a list of lengths. The yield of the +function is zero if all went well, or PCRE2_ERROR_NOMEMORY if the memory block +could not be obtained. When the list is no longer needed, it should be freed by +calling pcre2_substring_list_free(). +
++If this function encounters a substring that is unset, which can happen when +capture group number n+1 matches some part of the subject, but group +n has not been used at all, it returns an empty string. This can be +distinguished from a genuine zero-length substring by inspecting the +appropriate offset in the ovector, which contains PCRE2_UNSET for unset +substrings, or by calling pcre2_substring_length_bynumber(). +
+
+int pcre2_substring_number_from_name(const pcre2_code *code,
+ PCRE2_SPTR name);
+
+
+int pcre2_substring_length_byname(pcre2_match_data *match_data,
+ PCRE2_SPTR name, PCRE2_SIZE *length);
+
+
+int pcre2_substring_copy_byname(pcre2_match_data *match_data,
+ PCRE2_SPTR name, PCRE2_UCHAR *buffer, PCRE2_SIZE *bufflen);
+
+
+int pcre2_substring_get_byname(pcre2_match_data *match_data,
+ PCRE2_SPTR name, PCRE2_UCHAR **bufferptr, PCRE2_SIZE *bufflen);
+
+
+void pcre2_substring_free(PCRE2_UCHAR *buffer);
+
+To extract a substring by name, you first have to find the associated number. +For example, for this pattern: +
+ (a+)b(?<xxx>\d+)... ++the number of the capture group called "xxx" is 2. If the name is known to be +unique (PCRE2_DUPNAMES was not set), you can find the number from the name by +calling pcre2_substring_number_from_name(). The first argument is the +compiled pattern, and the second is the name. The yield of the function is the +group number, PCRE2_ERROR_NOSUBSTRING if there is no group with that name, or +PCRE2_ERROR_NOUNIQUESUBSTRING if there is more than one group with that name. +Given the number, you can extract the substring directly from the ovector, or +use one of the "bynumber" functions described above. + +
+For convenience, there are also "byname" functions that correspond to the +"bynumber" functions, the only difference being that the second argument is a +name instead of a number. If PCRE2_DUPNAMES is set and there are duplicate +names, these functions scan all the groups with the given name, and return the +captured substring from the first named group that is set. +
++If there are no groups with the given name, PCRE2_ERROR_NOSUBSTRING is +returned. If all groups with the name have numbers that are greater than the +number of slots in the ovector, PCRE2_ERROR_UNAVAILABLE is returned. If there +is at least one group with a slot in the ovector, but no group is found to be +set, PCRE2_ERROR_UNSET is returned. +
++Warning: If the pattern uses the (?| feature to set up multiple +capture groups with the same number, as described in the +section on duplicate group numbers +in the +pcre2pattern +page, you cannot use names to distinguish the different capture groups, because +names are not included in the compiled code. The matching process uses only +numbers. For this reason, the use of different names for groups with the +same number causes an error at compile time. +
++int pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, + PCRE2_SIZE length, PCRE2_SIZE startoffset, + uint32_t options, pcre2_match_data *match_data, + pcre2_match_context *mcontext, PCRE2_SPTR replacement, + PCRE2_SIZE rlength, PCRE2_UCHAR *outputbuffer, + PCRE2_SIZE *outlengthptr); +
++This function optionally calls pcre2_match() and then makes a copy of the +subject string in outputbuffer, replacing parts that were matched with +the replacement string, whose length is supplied in rlength, which +can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. As a +special case, if replacement is NULL and rlength is zero, the +replacement is assumed to be an empty string. If rlength is non-zero, an +error occurs if replacement is NULL. +
++There is an option (see PCRE2_SUBSTITUTE_REPLACEMENT_ONLY below) to return just +the replacement string(s). The default action is to perform just one +replacement if the pattern matches, but there is an option that requests +multiple replacements (see PCRE2_SUBSTITUTE_GLOBAL below). +
++If successful, pcre2_substitute() returns the number of substitutions +that were carried out. This may be zero if no match was found, and is never +greater than one unless PCRE2_SUBSTITUTE_GLOBAL is set. A negative value is +returned if an error is detected. +
++Matches in which a \K item in a lookahead in the pattern causes the match to +end before it starts are not supported, and give rise to an error return. For +global replacements, matches in which \K in a lookbehind causes the match to +start earlier than the point that was reached in the previous iteration are +also not supported. +
++The first seven arguments of pcre2_substitute() are the same as for +pcre2_match(), except that the partial matching options are not +permitted, and match_data may be passed as NULL, in which case a match +data block is obtained and freed within this function, using memory management +functions from the match context, if provided, or else those that were used to +allocate memory for the compiled code. +
++If match_data is not NULL and PCRE2_SUBSTITUTE_MATCHED is not set, the +provided block is used for all calls to pcre2_match(), and its contents +afterwards are the result of the final call. For global changes, this will +always be a no-match error. The contents of the ovector within the match data +block may or may not have been changed. +
++As well as the usual options for pcre2_match(), a number of additional +options can be set in the options argument of pcre2_substitute(). +One such option is PCRE2_SUBSTITUTE_MATCHED. When this is set, an external +match_data block must be provided, and it must have already been used for +an external call to pcre2_match() with the same pattern and subject +arguments. The data in the match_data block (return code, offset vector) +is then used for the first substitution instead of calling pcre2_match() +from within pcre2_substitute(). This allows an application to check for a +match before choosing to substitute, without having to repeat the match. +
++The contents of the externally supplied match data block are not changed when +PCRE2_SUBSTITUTE_MATCHED is set. If PCRE2_SUBSTITUTE_GLOBAL is also set, +pcre2_match() is called after the first substitution to check for further +matches, but this is done using an internally obtained match data block, thus +always leaving the external block unchanged. +
++The code argument is not used for matching before the first substitution +when PCRE2_SUBSTITUTE_MATCHED is set, but it must be provided, even when +PCRE2_SUBSTITUTE_GLOBAL is not set, because it contains information such as the +UTF setting and the number of capturing parentheses in the pattern. +
++The default action of pcre2_substitute() is to return a copy of the +subject string with matched substrings replaced. However, if +PCRE2_SUBSTITUTE_REPLACEMENT_ONLY is set, only the replacement substrings are +returned. In the global case, multiple replacements are concatenated in the +output buffer. Substitution callouts (see +below) +can be used to separate them if necessary. +
++The outlengthptr argument of pcre2_substitute() must point to a +variable that contains the length, in code units, of the output buffer. If the +function is successful, the value is updated to contain the length in code +units of the new string, excluding the trailing zero that is automatically +added. +
++If the function is not successful, the value set via outlengthptr depends +on the type of error. For syntax errors in the replacement string, the value is +the offset in the replacement string where the error was detected. For other +errors, the value is PCRE2_UNSET by default. This includes the case of the +output buffer being too small, unless PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set. +
++PCRE2_SUBSTITUTE_OVERFLOW_LENGTH changes what happens when the output buffer is +too small. The default action is to return PCRE2_ERROR_NOMEMORY immediately. If +this option is set, however, pcre2_substitute() continues to go through +the motions of matching and substituting (without, of course, writing anything) +in order to compute the size of buffer that is needed. This value is passed +back via the outlengthptr variable, with the result of the function still +being PCRE2_ERROR_NOMEMORY. +
++Passing a buffer size of zero is a permitted way of finding out how much memory +is needed for a given substitution. However, this does mean that the entire +operation is carried out twice. Depending on the application, it may be more +efficient to allocate a large buffer and free the excess afterwards, instead of +using PCRE2_SUBSTITUTE_OVERFLOW_LENGTH. +
++The replacement string, which is interpreted as a UTF string in UTF mode, is +checked for UTF validity unless PCRE2_NO_UTF_CHECK is set. An invalid UTF +replacement string causes an immediate return with the relevant UTF error code. +
++If PCRE2_SUBSTITUTE_LITERAL is set, the replacement string is not interpreted +in any way. By default, however, a dollar character is an escape character that +can specify the insertion of characters from capture groups and names from +(*MARK) or other control verbs in the pattern. Dollar is the only escape +character (backslash is treated as literal). The following forms are always +recognized: +
+ $$ insert a dollar character
+ $<n> or ${<n>} insert the contents of group <n>
+ $*MARK or ${*MARK} insert a control verb name
+
+Either a group number or a group name can be given for <n>. Curly brackets are
+required only if the following character would be interpreted as part of the
+number or name. The number may be zero to include the entire matched string.
+For example, if the pattern a(b)c is matched with "=abc=" and the replacement
+string "+$1$0$1+", the result is "=+babcb+=".
+
++$*MARK inserts the name from the last encountered backtracking control verb on +the matching path that has a name. (*MARK) must always include a name, but the +other verbs need not. For example, in the case of (*MARK:A)(*PRUNE) the name +inserted is "A", but for (*MARK:A)(*PRUNE:B) the relevant name is "B". This +facility can be used to perform simple simultaneous substitutions, as this +pcre2test example shows: +
+ /(*MARK:pear)apple|(*MARK:orange)lemon/g,replace=${*MARK}
+ apple lemon
+ 2: pear orange
+
+PCRE2_SUBSTITUTE_GLOBAL causes the function to iterate over the subject string,
+replacing every matching substring. If this option is not set, only the first
+matching substring is replaced. The search for matches takes place in the
+original subject string (that is, previous replacements do not affect it).
+Iteration is implemented by advancing the startoffset value for each
+search, which is always passed the entire subject string. If an offset limit is
+set in the match context, searching stops when that limit is reached.
+
++You can restrict the effect of a global substitution to a portion of the +subject string by setting either or both of startoffset and an offset +limit. Here is a pcre2test example: +
+ /B/g,replace=!,use_offset_limit + ABC ABC ABC ABC\=offset=3,offset_limit=12 + 2: ABC A!C A!C ABC ++When continuing with global substitutions after matching a substring with zero +length, an attempt to find a non-empty match at the same offset is performed. +If this is not successful, the offset is advanced by one character except when +CRLF is a valid newline sequence and the next two characters are CR, LF. In +this case, the offset is advanced by two characters. + +
+PCRE2_SUBSTITUTE_UNKNOWN_UNSET causes references to capture groups that do +not appear in the pattern to be treated as unset groups. This option should be +used with care, because it means that a typo in a group name or number no +longer causes the PCRE2_ERROR_NOSUBSTRING error. +
++PCRE2_SUBSTITUTE_UNSET_EMPTY causes unset capture groups (including unknown +groups when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) to be treated as empty +strings when inserted as described above. If this option is not set, an attempt +to insert an unset group causes the PCRE2_ERROR_UNSET error. This option does +not influence the extended substitution syntax described below. +
++PCRE2_SUBSTITUTE_EXTENDED causes extra processing to be applied to the +replacement string. Without this option, only the dollar character is special, +and only the group insertion forms listed above are valid. When +PCRE2_SUBSTITUTE_EXTENDED is set, two things change: +
++Firstly, backslash in a replacement string is interpreted as an escape +character. The usual forms such as \n or \x{ddd} can be used to specify +particular character codes, and backslash followed by any non-alphanumeric +character quotes that character. Extended quoting can be coded using \Q...\E, +exactly as in pattern strings. +
++There are also four escape sequences for forcing the case of inserted letters. +The insertion mechanism has three states: no case forcing, force upper case, +and force lower case. The escape sequences change the current state: \U and +\L change to upper or lower case forcing, respectively, and \E (when not +terminating a \Q quoted sequence) reverts to no case forcing. The sequences +\u and \l force the next character (if it is a letter) to upper or lower +case, respectively, and then the state automatically reverts to no case +forcing. Case forcing applies to all inserted characters, including those from +capture groups and letters within \Q...\E quoted sequences. If either +PCRE2_UTF or PCRE2_UCP was set when the pattern was compiled, Unicode +properties are used for case forcing characters whose code points are greater +than 127. +
++Note that case forcing sequences such as \U...\E do not nest. For example, +the result of processing "\Uaa\LBB\Ecc\E" is "AAbbcc"; the final \E has no +effect. Note also that the PCRE2_ALT_BSUX and PCRE2_EXTRA_ALT_BSUX options do +not apply to replacement strings. +
++The second effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more +flexibility to capture group substitution. The syntax is similar to that used +by Bash: +
+ ${<n>:-<string>}
+ ${<n>:+<string1>:<string2>}
+
+As before, <n> may be a group number or a name. The first form specifies a
+default value. If group <n> is set, its value is inserted; if not, <string> is
+expanded and the result inserted. The second form specifies strings that are
+expanded and inserted when group <n> is set or unset, respectively. The first
+form is just a convenient shorthand for
+
+ ${<n>:+${<n>}:<string>}
+
+Backslash can be used to escape colons and closing curly brackets in the
+replacement strings. A change of the case forcing state within a replacement
+string remains in force afterwards, as shown in this pcre2test example:
+
+ /(some)?(body)/substitute_extended,replace=${1:+\U:\L}HeLLo
+ body
+ 1: hello
+ somebody
+ 1: HELLO
+
+The PCRE2_SUBSTITUTE_UNSET_EMPTY option does not affect these extended
+substitutions. However, PCRE2_SUBSTITUTE_UNKNOWN_UNSET does cause unknown
+groups in the extended syntax forms to be treated as unset.
+
++If PCRE2_SUBSTITUTE_LITERAL is set, PCRE2_SUBSTITUTE_UNKNOWN_UNSET, +PCRE2_SUBSTITUTE_UNSET_EMPTY, and PCRE2_SUBSTITUTE_EXTENDED are irrelevant and +are ignored. +
++In the event of an error, pcre2_substitute() returns a negative error +code. Except for PCRE2_ERROR_NOMATCH (which is never returned), errors from +pcre2_match() are passed straight back. +
++PCRE2_ERROR_NOSUBSTRING is returned for a non-existent substring insertion, +unless PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set. +
++PCRE2_ERROR_UNSET is returned for an unset substring insertion (including an +unknown substring when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) when the simple +(non-extended) syntax is used and PCRE2_SUBSTITUTE_UNSET_EMPTY is not set. +
++PCRE2_ERROR_NOMEMORY is returned if the output buffer is not big enough. If the +PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set, the size of buffer that is +needed is returned via outlengthptr. Note that this does not happen by +default. +
++PCRE2_ERROR_NULL is returned if PCRE2_SUBSTITUTE_MATCHED is set but the +match_data argument is NULL or if the subject or replacement +arguments are NULL. For backward compatibility reasons an exception is made for +the replacement argument if the rlength argument is also 0. +
++PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in the +replacement string, with more particular errors being PCRE2_ERROR_BADREPESCAPE +(invalid escape sequence), PCRE2_ERROR_REPMISSINGBRACE (closing curly bracket +not found), PCRE2_ERROR_BADSUBSTITUTION (syntax error in extended group +substitution), and PCRE2_ERROR_BADSUBSPATTERN (the pattern match ended before +it started or the match started earlier than the current position in the +subject, which can happen if \K is used in an assertion). +
++As for all PCRE2 errors, a text message that describes the error can be +obtained by calling the pcre2_get_error_message() function (see +"Obtaining a textual error message" +above). +
+
+int pcre2_set_substitute_callout(pcre2_match_context *mcontext,
+ int (*callout_function)(pcre2_substitute_callout_block *, void *),
+ void *callout_data);
+
+
+The pcre2_set_substitute_callout() function can be used to specify a
+callout function for pcre2_substitute(). This information is passed in
+a match context. The callout function is called after each substitution has
+been processed, but it can cause the replacement not to happen. The callout
+function is not called for simulated substitutions that happen as a result of
+the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option.
+
+The first argument of the callout function is a pointer to a substitute callout +block structure, which contains the following fields, not necessarily in this +order: +
+ uint32_t version; + uint32_t subscount; + PCRE2_SPTR input; + PCRE2_SPTR output; + PCRE2_SIZE *ovector; + uint32_t oveccount; + PCRE2_SIZE output_offsets[2]; ++The version field contains the version number of the block format. The +current version is 0. The version number will increase in future if more fields +are added, but the intention is never to remove any of the existing fields. + +
+The subscount field is the number of the current match. It is 1 for the +first callout, 2 for the second, and so on. The input and output +pointers are copies of the values passed to pcre2_substitute(). +
++The ovector field points to the ovector, which contains the result of the +most recent match. The oveccount field contains the number of pairs that +are set in the ovector, and is always greater than zero. +
++The output_offsets vector contains the offsets of the replacement in the +output string. This has already been processed for dollar and (if requested) +backslash substitutions as described above. +
++The second argument of the callout function is the value passed as +callout_data when the function was registered. The value returned by the +callout function is interpreted as follows: +
++If the value is zero, the replacement is accepted, and, if +PCRE2_SUBSTITUTE_GLOBAL is set, processing continues with a search for the next +match. If the value is not zero, the current replacement is not accepted. If +the value is greater than zero, processing continues when +PCRE2_SUBSTITUTE_GLOBAL is set. Otherwise (the value is less than zero or +PCRE2_SUBSTITUTE_GLOBAL is not set), the rest of the input is copied to the +output and the call to pcre2_substitute() exits, returning the number of +matches so far. +
++int pcre2_substring_nametable_scan(const pcre2_code *code, + PCRE2_SPTR name, PCRE2_SPTR *first, PCRE2_SPTR *last); +
++When a pattern is compiled with the PCRE2_DUPNAMES option, names for capture +groups are not required to be unique. Duplicate names are always allowed for +groups with the same number, created by using the (?| feature. Indeed, if such +groups are named, they are required to use the same names. +
++Normally, patterns that use duplicate names are such that in any one match, +only one of each set of identically-named groups participates. An example is +shown in the +pcre2pattern +documentation. +
++When duplicates are present, pcre2_substring_copy_byname() and +pcre2_substring_get_byname() return the first substring corresponding to +the given name that is set. Only if none are set is PCRE2_ERROR_UNSET +returned. The pcre2_substring_number_from_name() function returns the +error PCRE2_ERROR_NOUNIQUESUBSTRING when there are duplicate names. +
++If you want to get full details of all captured substrings for a given name, +you must use the pcre2_substring_nametable_scan() function. The first +argument is the compiled pattern, and the second is the name. If the third and +fourth arguments are NULL, the function returns a group number for a unique +name, or PCRE2_ERROR_NOUNIQUESUBSTRING otherwise. +
++When the third and fourth arguments are not NULL, they must be pointers to +variables that are updated by the function. After it has run, they point to the +first and last entries in the name-to-number table for the given name, and the +function returns the length of each entry in code units. In both cases, +PCRE2_ERROR_NOSUBSTRING is returned if there are no entries for the given name. +
++The format of the name table is described +above +in the section entitled Information about a pattern. Given all the +relevant entries for the name, you can extract each of their numbers, and hence +the captured data. +
++The traditional matching function uses a similar algorithm to Perl, which stops +when it finds the first match at a given point in the subject. If you want to +find all possible matches, or the longest possible match at a given position, +consider using the alternative matching function (see below) instead. If you +cannot use the alternative function, you can kludge it up by making use of the +callout facility, which is described in the +pcre2callout +documentation. +
++What you have to do is to insert a callout right at the end of the pattern. +When your callout function is called, extract and save the current matched +substring. Then return 1, which forces pcre2_match() to backtrack and try +other alternatives. Ultimately, when it runs out of matches, +pcre2_match() will yield PCRE2_ERROR_NOMATCH. +
++int pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, + PCRE2_SIZE length, PCRE2_SIZE startoffset, + uint32_t options, pcre2_match_data *match_data, + pcre2_match_context *mcontext, + int *workspace, PCRE2_SIZE wscount); +
++The function pcre2_dfa_match() is called to match a subject string +against a compiled pattern, using a matching algorithm that scans the subject +string just once (not counting lookaround assertions), and does not backtrack +(except when processing lookaround assertions). This has different +characteristics to the normal algorithm, and is not compatible with Perl. Some +of the features of PCRE2 patterns are not supported. Nevertheless, there are +times when this kind of matching can be useful. For a discussion of the two +matching algorithms, and a list of features that pcre2_dfa_match() does +not support, see the +pcre2matching +documentation. +
++The arguments for the pcre2_dfa_match() function are the same as for +pcre2_match(), plus two extras. The ovector within the match data block +is used in a different way, and this is described below. The other common +arguments are used in the same way as for pcre2_match(), so their +description is not repeated here. +
++The two additional arguments provide workspace for the function. The workspace +vector should contain at least 20 elements. It is used for keeping track of +multiple paths through the pattern tree. More workspace is needed for patterns +and subjects where there are a lot of potential matches. +
++Here is an example of a simple call to pcre2_dfa_match(): +
+ int wspace[20]; + pcre2_match_data *md = pcre2_match_data_create(4, NULL); + int rc = pcre2_dfa_match( + re, /* result of pcre2_compile() */ + "some string", /* the subject string */ + 11, /* the length of the subject string */ + 0, /* start at offset 0 in the subject */ + 0, /* default options */ + md, /* the match data block */ + NULL, /* a match context; NULL means use defaults */ + wspace, /* working space vector */ + 20); /* number of elements (NOT size in bytes) */ ++ +
+The unused bits of the options argument for pcre2_dfa_match() must +be zero. The only bits that may be set are PCRE2_ANCHORED, +PCRE2_COPY_MATCHED_SUBJECT, PCRE2_ENDANCHORED, PCRE2_NOTBOL, PCRE2_NOTEOL, +PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, +PCRE2_PARTIAL_SOFT, PCRE2_DFA_SHORTEST, and PCRE2_DFA_RESTART. All but the last +four of these are exactly the same as for pcre2_match(), so their +description is not repeated here. +
+ PCRE2_PARTIAL_HARD + PCRE2_PARTIAL_SOFT ++These have the same general effect as they do for pcre2_match(), but the +details are slightly different. When PCRE2_PARTIAL_HARD is set for +pcre2_dfa_match(), it returns PCRE2_ERROR_PARTIAL if the end of the +subject is reached and there is still at least one matching possibility that +requires additional characters. This happens even if some complete matches have +already been found. When PCRE2_PARTIAL_SOFT is set, the return code +PCRE2_ERROR_NOMATCH is converted into PCRE2_ERROR_PARTIAL if the end of the +subject is reached, there have been no complete matches, but there is still at +least one matching possibility. The portion of the string that was inspected +when the longest partial match was found is set as the first matching string in +both cases. There is a more detailed discussion of partial and multi-segment +matching, with examples, in the +pcre2partial +documentation. +
+ PCRE2_DFA_SHORTEST ++Setting the PCRE2_DFA_SHORTEST option causes the matching algorithm to stop as +soon as it has found one match. Because of the way the alternative algorithm +works, this is necessarily the shortest possible match at the first possible +matching point in the subject string. +
+ PCRE2_DFA_RESTART ++When pcre2_dfa_match() returns a partial match, it is possible to call it +again, with additional subject characters, and have it continue with the same +match. The PCRE2_DFA_RESTART option requests this action; when it is set, the +workspace and wscount options must reference the same vector as +before because data about the match so far is left in them after a partial +match. There is more discussion of this facility in the +pcre2partial +documentation. + +
+When pcre2_dfa_match() succeeds, it may have matched more than one +substring in the subject. Note, however, that all the matches from one run of +the function start at the same point in the subject. The shorter matches are +all initial substrings of the longer matches. For example, if the pattern +
+ <.*> ++is matched against the string +
+ This is <something> <something else> <something further> no more ++the three matched strings are +
+ <something> <something else> <something further> + <something> <something else> + <something> ++On success, the yield of the function is a number greater than zero, which is +the number of matched substrings. The offsets of the substrings are returned in +the ovector, and can be extracted by number in the same way as for +pcre2_match(), but the numbers bear no relation to any capture groups +that may exist in the pattern, because DFA matching does not support capturing. + +
+Calls to the convenience functions that extract substrings by name +return the error PCRE2_ERROR_DFA_UFUNC (unsupported function) if used after a +DFA match. The convenience functions that extract substrings by number never +return PCRE2_ERROR_NOSUBSTRING. +
++The matched strings are stored in the ovector in reverse order of length; that +is, the longest matching string is first. If there were too many matches to fit +into the ovector, the yield of the function is zero, and the vector is filled +with the longest matches. +
++NOTE: PCRE2's "auto-possessification" optimization usually applies to character +repeats at the end of a pattern (as well as internally). For example, the +pattern "a\d+" is compiled as if it were "a\d++". For DFA matching, this +means that only one possible match is found. If you really do want multiple +matches in such cases, either use an ungreedy repeat such as "a\d+?" or set +the PCRE2_NO_AUTO_POSSESS option when compiling. +
++The pcre2_dfa_match() function returns a negative number when it fails. +Many of the errors are the same as for pcre2_match(), as described +above. +There are in addition the following errors that are specific to +pcre2_dfa_match(): +
+ PCRE2_ERROR_DFA_UITEM ++This return is given if pcre2_dfa_match() encounters an item in the +pattern that it does not support, for instance, the use of \C in a UTF mode or +a backreference. +
+ PCRE2_ERROR_DFA_UCOND ++This return is given if pcre2_dfa_match() encounters a condition item +that uses a backreference for the condition, or a test for recursion in a +specific capture group. These are not supported. +
+ PCRE2_ERROR_DFA_UINVALID_UTF ++This return is given if pcre2_dfa_match() is called for a pattern that +was compiled with PCRE2_MATCH_INVALID_UTF. This is not supported for DFA +matching. +
+ PCRE2_ERROR_DFA_WSSIZE ++This return is given if pcre2_dfa_match() runs out of space in the +workspace vector. +
+ PCRE2_ERROR_DFA_RECURSE ++When a recursion or subroutine call is processed, the matching function calls +itself recursively, using private memory for the ovector and workspace. +This error is given if the internal ovector is not large enough. This should be +extremely rare, as a vector of size 1000 is used. +
+ PCRE2_ERROR_DFA_BADRESTART ++When pcre2_dfa_match() is called with the PCRE2_DFA_RESTART option, +some plausibility checks are made on the contents of the workspace, which +should contain data about the previous partial match. If any of these checks +fail, this error is given. + +
+pcre2build(3), pcre2callout(3), pcre2demo(3), +pcre2matching(3), pcre2partial(3), pcre2posix(3), +pcre2sample(3), pcre2unicode(3). +
+
+Philip Hazel
+
+Retired from University Computing Service
+
+Cambridge, England.
+
+
+Last updated: 24 April 2024
+
+Copyright © 1997-2024 University of Cambridge.
+
+
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2build.html b/doc/html/pcre2build.html new file mode 100644 index 0000000..d4b0d33 --- /dev/null +++ b/doc/html/pcre2build.html @@ -0,0 +1,652 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+PCRE2 is distributed with a configure script that can be used to build +the library in Unix-like environments using the applications known as +Autotools. Also in the distribution are files to support building using +CMake instead of configure. The text file +README +contains general information about building with Autotools (some of which is +repeated below), and also has some comments about building on various operating +systems. The files in the vms directory support building under OpenVMS. +There is a lot more information about building PCRE2 without using +Autotools (including information about using CMake and building "by +hand") in the text file called +NON-AUTOTOOLS-BUILD. +You should consult this file as well as the +README +file if you are building in a non-Unix-like environment. +
++The rest of this document describes the optional features of PCRE2 that can be +selected when the library is compiled. It assumes use of the configure +script, where the optional features are selected or deselected by providing +options to configure before running the make command. However, the +same options can be selected in both Unix-like and non-Unix-like environments +if you are using CMake instead of configure to build PCRE2. +
++If you are not using Autotools or CMake, option selection can be done by +editing the config.h file, or by passing parameter settings to the +compiler, as described in +NON-AUTOTOOLS-BUILD. +
++The complete list of options for configure (which includes the standard +ones such as the selection of the installation directory) can be obtained by +running +
+ ./configure --help ++The following sections include descriptions of "on/off" options whose names +begin with --enable or --disable. Because of the way that configure +works, --enable and --disable always come in pairs, so the complementary option +always exists as well, but as it specifies the default, it is not described. +Options that specify values have names that start with --with. At the end of a +configure run, a summary of the configuration is output. + +
+By default, a library called libpcre2-8 is built, containing functions +that take string arguments contained in arrays of bytes, interpreted either as +single-byte characters, or UTF-8 strings. You can also build two other +libraries, called libpcre2-16 and libpcre2-32, which process +strings that are contained in arrays of 16-bit and 32-bit code units, +respectively. These can be interpreted either as single-unit characters or +UTF-16/UTF-32 strings. To build these additional libraries, add one or both of +the following to the configure command: +
+ --enable-pcre2-16 + --enable-pcre2-32 ++If you do not want the 8-bit library, add +
+ --disable-pcre2-8 ++as well. At least one of the three libraries must be built. Note that the POSIX +wrapper is for the 8-bit library only, and that pcre2grep is an 8-bit +program. Neither of these is built if you select only the 16-bit or 32-bit +libraries. + +
+The Autotools PCRE2 building process uses libtool to build both shared +and static libraries by default. You can suppress an unwanted library by adding +one of +
+ --disable-shared + --disable-static ++to the configure command. Setting --disable-shared ensures that PCRE2 +libraries are built as static libraries. The binaries that are then created as +part of the build process (for example, pcre2test and pcre2grep) +are linked statically with one or more PCRE2 libraries, but may also be +dynamically linked with other libraries such as libc. If you want these +binaries to be fully statically linked, you can set LDFLAGS like this: +
+By default, PCRE2 is built with support for Unicode and UTF character strings. +To build it without Unicode support, add +
+ --disable-unicode ++to the configure command. This setting applies to all three libraries. It +is not possible to build one library with Unicode support and another without +in the same configuration. + +
+Of itself, Unicode support does not make PCRE2 treat strings as UTF-8, UTF-16 +or UTF-32. To do that, applications that use the library can set the PCRE2_UTF +option when they call pcre2_compile() to compile a pattern. +Alternatively, patterns may be started with (*UTF) unless the application has +locked this out by setting PCRE2_NEVER_UTF. +
++UTF support allows the libraries to process character code points up to +0x10ffff in the strings that they handle. Unicode support also gives access to +the Unicode properties of characters, using pattern escapes such as \P, \p, +and \X. Only the general category properties such as Lu and Nd, +script names, and some bi-directional properties are supported. Details are +given in the +pcre2pattern +documentation. +
++Pattern escapes such as \d and \w do not by default make use of Unicode +properties. The application can request that they do by setting the PCRE2_UCP +option. Unless the application has set PCRE2_NEVER_UCP, a pattern may also +request this by starting with (*UCP). +
++The \C escape sequence, which matches a single code unit, even in a UTF mode, +can cause unpredictable behaviour because it may leave the current matching +point in the middle of a multi-code-unit character. The application can lock it +out by setting the PCRE2_NEVER_BACKSLASH_C option when calling +pcre2_compile(). There is also a build-time option +
+ --enable-never-backslash-C ++(note the upper case C) which locks out the use of \C entirely. + +
+Just-in-time (JIT) compiler support is included in the build by specifying +
+ --enable-jit ++This support is available only for certain hardware architectures. If this +option is set for an unsupported architecture, a building error occurs. +If in doubt, use +
+ --enable-jit=auto ++which enables JIT only if the current hardware is supported. You can check +if JIT is enabled in the configuration summary that is output at the end of a +configure run. If you are enabling JIT under SELinux you may also want to +add +
+ --enable-jit-sealloc ++which enables the use of an execmem allocator in JIT that is compatible with +SELinux. This has no effect if JIT is not enabled. See the +pcre2jit +documentation for a discussion of JIT usage. When JIT support is enabled, +pcre2grep automatically makes use of it, unless you add +
+ --disable-pcre2grep-jit ++to the configure command. + +
+By default, PCRE2 interprets the linefeed (LF) character as indicating the end +of a line. This is the normal newline character on Unix-like systems. You can +compile PCRE2 to use carriage return (CR) instead, by adding +
+ --enable-newline-is-cr ++to the configure command. There is also an --enable-newline-is-lf option, +which explicitly specifies linefeed as the newline character. + +
+Alternatively, you can specify that line endings are to be indicated by the +two-character sequence CRLF (CR immediately followed by LF). If you want this, +add +
+ --enable-newline-is-crlf ++to the configure command. There is a fourth option, specified by +
+ --enable-newline-is-anycrlf ++which causes PCRE2 to recognize any of the three sequences CR, LF, or CRLF as +indicating a line ending. A fifth option, specified by +
+ --enable-newline-is-any ++causes PCRE2 to recognize any Unicode newline sequence. The Unicode newline +sequences are the three just mentioned, plus the single characters VT (vertical +tab, U+000B), FF (form feed, U+000C), NEL (next line, U+0085), LS (line +separator, U+2028), and PS (paragraph separator, U+2029). The final option is +
+ --enable-newline-is-nul ++which causes NUL (binary zero) to be set as the default line-ending character. + +
+Whatever default line ending convention is selected when PCRE2 is built can be +overridden by applications that use the library. At build time it is +recommended to use the standard for your operating system. +
++By default, the sequence \R in a pattern matches any Unicode newline sequence, +independently of what has been selected as the line ending sequence. If you +specify +
+ --enable-bsr-anycrlf ++the default is changed so that \R matches only CR, LF, or CRLF. Whatever is +selected when PCRE2 is built can be overridden by applications that use the +library. + +
+Within a compiled pattern, offset values are used to point from one part to +another (for example, from an opening parenthesis to an alternation +metacharacter). By default, in the 8-bit and 16-bit libraries, two-byte values +are used for these offsets, leading to a maximum size for a compiled pattern of +around 64 thousand code units. This is sufficient to handle all but the most +gigantic patterns. Nevertheless, some people do want to process truly enormous +patterns, so it is possible to compile PCRE2 to use three-byte or four-byte +offsets by adding a setting such as +
+ --with-link-size=3 ++to the configure command. The value given must be 2, 3, or 4. For the +16-bit library, a value of 3 is rounded up to 4. In these libraries, using +longer offsets slows down the operation of PCRE2 because it has to load +additional data when handling them. For the 32-bit library the value is always +4 and cannot be overridden; the value of --with-link-size is ignored. + +
+The pcre2_match() function increments a counter each time it goes round +its main loop. Putting a limit on this counter controls the amount of computing +resource used by a single call to pcre2_match(). The limit can be changed +at run time, as described in the +pcre2api +documentation. The default is 10 million, but this can be changed by adding a +setting such as +
+ --with-match-limit=500000 ++to the configure command. This setting also applies to the +pcre2_dfa_match() matching function, and to JIT matching (though the +counting is done differently). + +
+The pcre2_match() function uses heap memory to record backtracking +points. The more nested backtracking points there are (that is, the deeper the +search tree), the more memory is needed. There is an upper limit, specified in +kibibytes (units of 1024 bytes). This limit can be changed at run time, as +described in the +pcre2api +documentation. The default limit (in effect unlimited) is 20 million. You can +change this by a setting such as +
+ --with-heap-limit=500 ++which limits the amount of heap to 500 KiB. This limit applies only to +interpretive matching in pcre2_match() and pcre2_dfa_match(), which +may also use the heap for internal workspace when processing complicated +patterns. This limit does not apply when JIT (which has its own memory +arrangements) is used. + +
+You can also explicitly limit the depth of nested backtracking in the +pcre2_match() interpreter. This limit defaults to the value that is set +for --with-match-limit. You can set a lower default limit by adding, for +example, +
+ --with-match-limit-depth=10000 ++to the configure command. This value can be overridden at run time. This +depth limit indirectly limits the amount of heap memory that is used, but +because the size of each backtracking "frame" depends on the number of +capturing parentheses in a pattern, the amount of heap that is used before the +limit is reached varies from pattern to pattern. This limit was more useful in +versions before 10.30, where function recursion was used for backtracking. + +
+As well as applying to pcre2_match(), the depth limit also controls +the depth of recursive function calls in pcre2_dfa_match(). These are +used for lookaround assertions, atomic groups, and recursion within patterns. +The limit does not apply to JIT matching. +
++Lookbehind assertions in which one or more branches can match a variable number +of characters are supported only if there is a maximum matching length for each +top-level branch. There is a limit to this maximum that defaults to 255 +characters. You can alter this default by a setting such as +
+ --with-max-varlookbehind=100 ++The limit can be changed at runtime by calling +pcre2_set_max_varlookbehind(). Lookbehind assertions in which every +branch matches a fixed number of characters (not necessarily all the same) are +not constrained by this limit. + +
+PCRE2 uses fixed tables for processing characters whose code points are less +than 256. By default, PCRE2 is built with a set of tables that are distributed +in the file src/pcre2_chartables.c.dist. These tables are for ASCII codes +only. If you add +
+ --enable-rebuild-chartables ++to the configure command, the distributed tables are no longer used. +Instead, a program called pcre2_dftables is compiled and run. This +outputs the source for a new set of tables, created in the default locale of your +C run-time system. This method of replacing the tables does not work if you are +cross compiling, because pcre2_dftables needs to be run on the local +host and therefore not compiled with the cross compiler. + +
+If you need to create alternative tables when cross compiling, you will have to +do so "by hand". There may also be other reasons for creating tables manually. +To cause pcre2_dftables to be built on the local host, run a normal +compiling command, and then run the program with the output file as its +argument, for example: +
+ cc src/pcre2_dftables.c -o pcre2_dftables + ./pcre2_dftables src/pcre2_chartables.c ++This builds the tables in the default locale of the local host. If you want to +specify a locale, you must use the -L option: +
+ LC_ALL=fr_FR ./pcre2_dftables -L src/pcre2_chartables.c ++You can also specify -b (with or without -L). This causes the tables to be +written in binary instead of as source code. A set of binary tables can be +loaded into memory by an application and passed to pcre2_compile() in the +same way as tables created by calling pcre2_maketables(). The tables are +just a string of bytes, independent of hardware characteristics such as +endianness. This means they can be bundled with an application that runs in +different environments, to ensure consistent behaviour. + +
+PCRE2 assumes by default that it will run in an environment where the character +code is ASCII or Unicode, which is a superset of ASCII. This is the case for +most computer operating systems. PCRE2 can, however, be compiled to run in an +8-bit EBCDIC environment by adding +
+ --enable-ebcdic --disable-unicode ++to the configure command. This setting implies +--enable-rebuild-chartables. You should only use it if you know that you are in +an EBCDIC environment (for example, an IBM mainframe operating system). + +
+It is not possible to support both EBCDIC and UTF-8 codes in the same version +of the library. Consequently, --enable-unicode and --enable-ebcdic are mutually +exclusive. +
++The EBCDIC character that corresponds to an ASCII LF is assumed to have the +value 0x15 by default. However, in some EBCDIC environments, 0x25 is used. In +such an environment you should use +
+ --enable-ebcdic-nl25 ++as well as, or instead of, --enable-ebcdic. The EBCDIC character for CR has the +same value as in ASCII, namely, 0x0d. Whichever of 0x15 and 0x25 is not +chosen as LF is made to correspond to the Unicode NEL character (which, in +Unicode, is 0x85). + +
+The options that select newline behaviour, such as --enable-newline-is-cr, +and equivalent run-time options, refer to these character values in an EBCDIC +environment. +
++By default pcre2grep supports the use of callouts with string arguments +within the patterns it is matching. There are two kinds: one that generates +output using local code, and another that calls an external program or script. +If --disable-pcre2grep-callout-fork is added to the configure command, +only the first kind of callout is supported; if --disable-pcre2grep-callout is +used, all callouts are completely ignored. For more details of pcre2grep +callouts, see the +pcre2grep +documentation. +
++By default, pcre2grep reads all files as plain text. You can build it so +that it recognizes files whose names end in .gz or .bz2, and reads +them with libz or libbz2, respectively, by adding one or both of +
+ --enable-pcre2grep-libz + --enable-pcre2grep-libbz2 ++to the configure command. These options naturally require that the +relevant libraries are installed on your system. Configuration will fail if +they are not. + +
+pcre2grep uses an internal buffer to hold a "window" on the file it is +scanning, in order to be able to output "before" and "after" lines when it +finds a match. The default starting size of the buffer is 20KiB. The buffer +itself is three times this size, but because of the way it is used for holding +"before" lines, the longest line that is guaranteed to be processable is the +notional buffer size. If a longer line is encountered, pcre2grep +automatically expands the buffer, up to a specified maximum size, whose default +is 1MiB or the starting size, whichever is the larger. You can change the +default parameter values by adding, for example, +
+ --with-pcre2grep-bufsize=51200 + --with-pcre2grep-max-bufsize=2097152 ++to the configure command. The caller of pcre2grep can override +these values by using --buffer-size and --max-buffer-size on the command line. + +
+If you add one of +
+ --enable-pcre2test-libreadline + --enable-pcre2test-libedit ++to the configure command, pcre2test is linked with the +libreadline or libedit library, respectively, and when its input is +from a terminal, it reads it using the readline() function. This provides +line-editing and history facilities. Note that libreadline is +GPL-licensed, so if you distribute a binary of pcre2test linked in this +way, there may be licensing issues. These can be avoided by linking instead +with libedit, which has a BSD licence. + +
+Setting --enable-pcre2test-libreadline causes the -lreadline option to be +added to the pcre2test build. In many operating environments with a +system-installed readline library this is sufficient. However, in some +environments (e.g. if an unmodified distribution version of readline is in +use), some extra configuration may be necessary. The INSTALL file for +libreadline says this: +
+ "Readline uses the termcap functions, but does not link with + the termcap or curses library itself, allowing applications + which link with readline the to choose an appropriate library." ++If your environment has not been set up so that an appropriate library is +automatically included, you may need to add something like +
+ LIBS="-lncurses" ++immediately before the configure command. + +
+If you add +
+ --enable-debug ++to the configure command, additional debugging code is included in the +build. This feature is intended for use by the PCRE2 maintainers. + +
+If you add +
+ --enable-valgrind ++to the configure command, PCRE2 will use valgrind annotations to mark +certain memory regions as unaddressable. This allows it to detect invalid +memory accesses, and is mostly useful for debugging PCRE2 itself. + +
+If your C compiler is gcc, you can build a version of PCRE2 that can generate a +code coverage report for its test suite. To enable this, you must install +lcov version 1.6 or above. Then specify +
+ --enable-coverage ++to the configure command and build PCRE2 in the usual way. + +
+Note that using ccache (a caching C compiler) is incompatible with code +coverage reporting. If you have configured ccache to run automatically +on your system, you must set the environment variable +
+ CCACHE_DISABLE=1 ++before running make to build PCRE2, so that ccache is not used. + +
+When --enable-coverage is used, the following additional targets are added to the +Makefile: +
+ make coverage ++This creates a fresh coverage report for the PCRE2 test suite. It is equivalent +to running "make coverage-reset", "make coverage-baseline", "make check", and +then "make coverage-report". +
+ make coverage-reset ++This zeroes the coverage counters, but does nothing else. +
+ make coverage-baseline ++This captures baseline coverage information. +
+ make coverage-report ++This creates the coverage report. +
+ make coverage-clean-report ++This removes the generated coverage report without cleaning the coverage data +itself. +
+ make coverage-clean-data ++This removes the captured coverage data without removing the coverage files +created at compile time (*.gcno). +
+ make coverage-clean ++This cleans all coverage data including the generated coverage report. For more +information about code coverage, see the gcov and lcov +documentation. + +
+The C99 standard defines formatting modifiers z and t for size_t and +ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in +environments other than old versions of Microsoft Visual Studio when +__STDC_VERSION__ is defined and has a value greater than or equal to 199901L +(indicating support for C99). +However, there is at least one environment that claims to be C99 but does not +support these modifiers. If +
+ --disable-percent-zt ++is specified, no use is made of the z or t modifiers. Instead of %td or %zu, +a suitable format is used depending on the size of long for the platform. + +
+There is a special option for use by people who want to run fuzzing tests on +PCRE2: +
+ --enable-fuzz-support ++At present this applies only to the 8-bit library. If set, it causes an extra +library called libpcre2-fuzzsupport.a to be built, but not installed. This +contains a single function called LLVMFuzzerTestOneInput() whose arguments are +a pointer to a string and the length of the string. When called, this function +tries to compile the string as a pattern, and if that succeeds, to match it. +This is done both with no options and with some random options bits that are +generated from the string. + +
+Setting --enable-fuzz-support also causes a binary called pcre2fuzzcheck +to be created. This is normally run under valgrind or used when PCRE2 is +compiled with address sanitizing enabled. It calls the fuzzing function and +outputs information about what it is doing. The input strings are specified by +arguments: if an argument starts with "=" the rest of it is a literal input +string. Otherwise, it is assumed to be a file name, and the contents of the +file are the test string. +
++In versions of PCRE2 prior to 10.30, there were two ways of handling +backtracking in the pcre2_match() function. The default was to use the +system stack, but if +
+ --disable-stack-for-recursion ++was set, memory on the heap was used. From release 10.30 onwards this has +changed (the stack is no longer used) and this option now does nothing except +give a warning. + +
+pcre2api(3), pcre2-config(3). +
+
+Philip Hazel
+
+Retired from University Computing Service
+
+Cambridge, England.
+
+
+Last updated: 15 April 2024
+
+Copyright © 1997-2024 University of Cambridge.
+
+
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2callout.html b/doc/html/pcre2callout.html new file mode 100644 index 0000000..cdb65ad --- /dev/null +++ b/doc/html/pcre2callout.html @@ -0,0 +1,480 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+#include <pcre2.h> +
+
+int (*pcre2_callout)(pcre2_callout_block *, void *);
+
+
+int pcre2_callout_enumerate(const pcre2_code *code,
+ int (*callback)(pcre2_callout_enumerate_block *, void *),
+ void *user_data);
+
+PCRE2 provides a feature called "callout", which is a means of temporarily +passing control to the caller of PCRE2 in the middle of pattern matching. The +caller of PCRE2 provides an external function by putting its entry point in +a match context (see pcre2_set_callout() in the +pcre2api +documentation). +
++When using the pcre2_substitute() function, an additional callout feature +is available. This does a callout after each change to the subject string and +is described in the +pcre2api +documentation; the rest of this document is concerned with callouts during +pattern matching. +
++Within a regular expression, (?C<arg>) indicates a point at which the external +function is to be called. Different callout points can be identified by putting +a number less than 256 after the letter C. The default value is zero. +Alternatively, the argument may be a delimited string. The starting delimiter +must be one of ` ' " ^ % # $ { and the ending delimiter is the same as the +start, except for {, where the ending delimiter is }. If the ending delimiter +is needed within the string, it must be doubled. For example, this pattern has +two callout points: +
+ (?C1)abc(?C"some ""arbitrary"" text")def ++If the PCRE2_AUTO_CALLOUT option bit is set when a pattern is compiled, PCRE2 +automatically inserts callouts, all with number 255, before each item in the +pattern except for immediately before or after an explicit callout. For +example, if PCRE2_AUTO_CALLOUT is used with the pattern +
+ A(?C3)B ++it is processed as if it were +
+ (?C255)A(?C3)B(?C255) ++Here is a more complicated example: +
+ A(\d{2}|--)
+
+With PCRE2_AUTO_CALLOUT, this pattern is processed as if it were
+
+ (?C255)A(?C255)((?C255)\d{2}(?C255)|(?C255)-(?C255)-(?C255))(?C255)
+
+Notice that there is a callout before and after each parenthesis and
+alternation bar. If the pattern contains a conditional group whose condition is
+an assertion, an automatic callout is inserted immediately before the
+condition. Such a callout may also be inserted explicitly, for example:
++ (?(?C9)(?=a)ab|de) (?(?C%text%)(?!=d)ab|de) ++This applies only to assertion conditions (because they are themselves +independent groups). + +
+Callouts can be useful for tracking the progress of pattern matching. The +pcre2test +program has a pattern qualifier (/auto_callout) that sets automatic callouts. +When any callouts are present, the output from pcre2test indicates how +the pattern is being matched. This is useful information when you are trying to +optimize the performance of a particular pattern. +
++You should be aware that, because of optimizations in the way PCRE2 compiles +and matches patterns, callouts sometimes do not happen exactly as you might +expect. +
++At compile time, PCRE2 "auto-possessifies" repeated items when it knows that +what follows cannot be part of the repeat. For example, a+[bc] is compiled as +if it were a++[bc]. The pcre2test output when this pattern is compiled +with PCRE2_ANCHORED and PCRE2_AUTO_CALLOUT and then applied to the string +"aaaa" is: +
+ --->aaaa + +0 ^ a+ + +2 ^ ^ [bc] + No match ++This indicates that when matching [bc] fails, there is no backtracking into a+ +(because it is being treated as a++) and therefore the callouts that would be +taken for the backtracks do not occur. You can disable the auto-possessify +feature by passing PCRE2_NO_AUTO_POSSESS to pcre2_compile(), or starting +the pattern with (*NO_AUTO_POSSESS). In this case, the output changes to this: +
+ --->aaaa + +0 ^ a+ + +2 ^ ^ [bc] + +2 ^ ^ [bc] + +2 ^ ^ [bc] + +2 ^^ [bc] + No match ++This time, when matching [bc] fails, the matcher backtracks into a+ and tries +again, repeatedly, until a+ itself fails. + +
+By default, an optimization is applied when .* is the first significant item in +a pattern. If PCRE2_DOTALL is set, so that the dot can match any character, the +pattern is automatically anchored. If PCRE2_DOTALL is not set, a match can +start only after an internal newline or at the beginning of the subject, and +pcre2_compile() remembers this. If a pattern has more than one top-level +branch, automatic anchoring occurs if all branches are anchorable. +
++This optimization is disabled, however, if .* is in an atomic group or if there +is a backreference to the capture group in which it appears. It is also +disabled if the pattern contains (*PRUNE) or (*SKIP). However, the presence of +callouts does not affect it. +
++For example, if the pattern .*\d is compiled with PCRE2_AUTO_CALLOUT and +applied to the string "aa", the pcre2test output is: +
+ --->aa + +0 ^ .* + +2 ^ ^ \d + +2 ^^ \d + +2 ^ \d + No match ++This shows that all match attempts start at the beginning of the subject. In +other words, the pattern is anchored. You can disable this optimization by +passing PCRE2_NO_DOTSTAR_ANCHOR to pcre2_compile(), or starting the +pattern with (*NO_DOTSTAR_ANCHOR). In this case, the output changes to: +
+ --->aa + +0 ^ .* + +2 ^ ^ \d + +2 ^^ \d + +2 ^ \d + +0 ^ .* + +2 ^^ \d + +2 ^ \d + No match ++This shows more match attempts, starting at the second subject character. +Another optimization, described in the next section, means that there is no +subsequent attempt to match with an empty subject. + +
+Other optimizations that provide fast "no match" results also affect callouts. +For example, if the pattern is +
+ ab(?C4)cd ++PCRE2 knows that any matching string must contain the letter "d". If the +subject string is "abyz", the lack of "d" means that matching doesn't ever +start, and the callout is never reached. However, with "abyd", though the +result is still no match, the callout is obeyed. + +
+For most patterns PCRE2 also knows the minimum length of a matching string, and +will immediately give a "no match" return without actually running a match if +the subject is not long enough, or, for unanchored patterns, if it has been +scanned far enough. +
++You can disable these optimizations by passing the PCRE2_NO_START_OPTIMIZE +option to pcre2_compile(), or by starting the pattern with +(*NO_START_OPT). This slows down the matching process, but does ensure that +callouts such as the example above are obeyed. +
++During matching, when PCRE2 reaches a callout point, if an external function is +provided in the match context, it is called. This applies to normal, +DFA, and JIT matching. The first argument to the callout function is a pointer +to a pcre2_callout block. The second argument is the void * callout data +that was supplied when the callout was set up by calling +pcre2_set_callout() (see the +pcre2api +documentation). The callout block structure contains the following fields, not +necessarily in this order: +
+ uint32_t version; + uint32_t callout_number; + uint32_t capture_top; + uint32_t capture_last; + uint32_t callout_flags; + PCRE2_SIZE *offset_vector; + PCRE2_SPTR mark; + PCRE2_SPTR subject; + PCRE2_SIZE subject_length; + PCRE2_SIZE start_match; + PCRE2_SIZE current_position; + PCRE2_SIZE pattern_position; + PCRE2_SIZE next_item_length; + PCRE2_SIZE callout_string_offset; + PCRE2_SIZE callout_string_length; + PCRE2_SPTR callout_string; ++The version field contains the version number of the block format. The +current version is 2; the three callout string fields were added for version 1, +and the callout_flags field for version 2. If you are writing an +application that might use an earlier release of PCRE2, you should check the +version number before accessing any of these fields. The version number will +increase in future if more fields are added, but the intention is never to +remove any of the existing fields. + +
+For a numerical callout, callout_string is NULL, and callout_number +contains the number of the callout, in the range 0-255. This is the number +that follows (?C for callouts that are part of the pattern; it is 255 for +automatically generated callouts. +
++For callouts with string arguments, callout_number is always zero, and +callout_string points to the string that is contained within the compiled +pattern. Its length is given by callout_string_length. Duplicated ending +delimiters that were present in the original pattern string have been turned +into single characters, but there is no other processing of the callout string +argument. An additional code unit containing binary zero is present after the +string, but is not included in the length. The delimiter that was used to start +the string is also stored within the pattern, immediately before the string +itself. You can access this delimiter as callout_string[-1] if you need +it. +
++The callout_string_offset field is the code unit offset to the start of +the callout argument string within the original pattern string. This is +provided for the benefit of applications such as script languages that might +need to report errors in the callout string within the pattern. +
++The remaining fields in the callout block are the same for both kinds of +callout. +
++The offset_vector field is a pointer to a vector of capturing offsets +(the "ovector"). You may read the elements in this vector, but you must not +change any of them. +
++For calls to pcre2_match(), the offset_vector field is not (since +release 10.30) a pointer to the actual ovector that was passed to the matching +function in the match data block. Instead it points to an internal ovector of a +size large enough to hold all possible captured substrings in the pattern. Note +that whenever a recursion or subroutine call within a pattern completes, the +capturing state is reset to what it was before. +
++The capture_last field contains the number of the most recently captured +substring, and the capture_top field contains one more than the number of +the highest numbered captured substring so far. If no substrings have yet been +captured, the value of capture_last is 0 and the value of +capture_top is 1. The values of these fields do not always differ by one; +for example, when the callout in the pattern ((a)(b))(?C2) is taken, +capture_last is 1 but capture_top is 4. +
++The contents of ovector[2] to ovector[<capture_top>*2-1] can be inspected in +order to extract substrings that have been matched so far, in the same way as +extracting substrings after a match has completed. The values in ovector[0] and +ovector[1] are always PCRE2_UNSET because the match is by definition not +complete. Substrings that have not been captured but whose numbers are less +than capture_top also have both of their ovector slots set to +PCRE2_UNSET. +
++For DFA matching, the offset_vector field points to the ovector that was +passed to the matching function in the match data block for callouts at the top +level, but to an internal ovector during the processing of pattern recursions, +lookarounds, and atomic groups. However, these ovectors hold no useful +information because pcre2_dfa_match() does not support substring +capturing. The value of capture_top is always 1 and the value of +capture_last is always 0 for DFA matching. +
++The subject and subject_length fields contain copies of the values +that were passed to the matching function. +
++The start_match field normally contains the offset within the subject at +which the current match attempt started. However, if the escape sequence \K +has been encountered, this value is changed to reflect the modified starting +point. If the pattern is not anchored, the callout function may be called +several times from the same point in the pattern for different starting points +in the subject. +
++The current_position field contains the offset within the subject of the +current match pointer. +
++The pattern_position field contains the offset in the pattern string to +the next item to be matched. +
++The next_item_length field contains the length of the next item to be +processed in the pattern string. When the callout is at the end of the pattern, +the length is zero. When the callout precedes an opening parenthesis, the +length includes meta characters that follow the parenthesis. For example, in a +callout before an assertion such as (?=ab) the length is 3. For an alternation +bar or a closing parenthesis, the length is one, unless a closing parenthesis +is followed by a quantifier, in which case its length is included. (This +changed in release 10.23. In earlier releases, before an opening parenthesis +the length was that of the entire group, and before an alternation bar or a +closing parenthesis the length was zero.) +
++The pattern_position and next_item_length fields are intended to +help in distinguishing between different automatic callouts, which all have the +same callout number. However, they are set for all callouts, and are used by +pcre2test to show the next item to be matched when displaying callout +information. +
++In callouts from pcre2_match() the mark field contains a pointer to +the zero-terminated name of the most recently passed (*MARK), (*PRUNE), or +(*THEN) item in the match, or NULL if no such items have been passed. Instances +of (*PRUNE) or (*THEN) without a name do not obliterate a previous (*MARK). In +callouts from the DFA matching function this field always contains NULL. +
++The callout_flags field is always zero in callouts from +pcre2_dfa_match() or when JIT is being used. When pcre2_match() +without JIT is used, the following bits may be set: +
+ PCRE2_CALLOUT_STARTMATCH ++This is set for the first callout after the start of matching for each new +starting position in the subject. +
+ PCRE2_CALLOUT_BACKTRACK ++This is set if there has been a matching backtrack since the previous callout, +or since the start of matching if this is the first callout from a +pcre2_match() run. + +
+Both bits are set when a backtrack has caused a "bumpalong" to a new starting +position in the subject. Output from pcre2test does not indicate the +presence of these bits unless the callout_extra modifier is set. +
++The information in the callout_flags field is provided so that +applications can track and tell their users how matching with backtracking is +done. This can be useful when trying to optimize patterns, or just to +understand how PCRE2 works. There is no support in pcre2_dfa_match() +because there is no backtracking in DFA matching, and there is no support in +JIT because JIT is all about maximizing matching performance. In both these +cases the callout_flags field is always zero. +
++The external callout function returns an integer to PCRE2. If the value is +zero, matching proceeds as normal. If the value is greater than zero, matching +fails at the current point, but the testing of other matching possibilities +goes ahead, just as if a lookahead assertion had failed. If the value is less +than zero, the match is abandoned, and the matching function returns the +negative value. +
++Negative values should normally be chosen from the set of PCRE2_ERROR_xxx +values. In particular, PCRE2_ERROR_NOMATCH forces a standard "no match" +failure. The error number PCRE2_ERROR_CALLOUT is reserved for use by callout +functions; it will never be used by PCRE2 itself. +
+
+int pcre2_callout_enumerate(const pcre2_code *code,
+ int (*callback)(pcre2_callout_enumerate_block *, void *),
+ void *user_data);
+
+
+A script language that supports the use of string arguments in callouts might
+like to scan all the callouts in a pattern before running the match. This can
+be done by calling pcre2_callout_enumerate(). The first argument is a
+pointer to a compiled pattern, the second points to a callback function, and
+the third is arbitrary user data. The callback function is called for every
+callout in the pattern in the order in which they appear. Its first argument is
+a pointer to a callout enumeration block, and its second argument is the
+user_data value that was passed to pcre2_callout_enumerate(). The
+data block contains the following fields:
+
+ version Block version number + pattern_position Offset to next item in pattern + next_item_length Length of next item in pattern + callout_number Number for numbered callouts + callout_string_offset Offset to string within pattern + callout_string_length Length of callout string + callout_string Points to callout string or is NULL ++The version number is currently 0. It will increase if new fields are ever +added to the block. The remaining fields are the same as their namesakes in the +pcre2_callout block that is used for callouts during matching, as +described +above. + +
+Note that the value of pattern_position is unique for each callout. +However, if a callout occurs inside a group that is quantified with a non-zero +minimum or a fixed maximum, the group is replicated inside the compiled +pattern. For example, a pattern such as /(a){2}/ is compiled as if it were +/(a)(a)/. This means that the callout will be enumerated more than once, but +with the same value for pattern_position in each case. +
++The callback function should normally return zero. If it returns a non-zero +value, scanning the pattern stops, and that value is returned from +pcre2_callout_enumerate(). +
+
+Philip Hazel
+
+Retired from University Computing Service
+
+Cambridge, England.
+
+
+Last updated: 19 January 2024
+
+Copyright © 1997-2024 University of Cambridge.
+
+
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2compat.html b/doc/html/pcre2compat.html new file mode 100644 index 0000000..d60182e --- /dev/null +++ b/doc/html/pcre2compat.html @@ -0,0 +1,276 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+DIFFERENCES BETWEEN PCRE2 AND PERL
+
+
+This document describes some of the known differences in the ways that PCRE2 +and Perl handle regular expressions. The differences described here are with +respect to Perl version 5.38.0, but as both Perl and PCRE2 are continually +changing, the information may at times be out of date. +
++1. When PCRE2_DOTALL (equivalent to Perl's /s qualifier) is not set, the +behaviour of the '.' metacharacter differs from Perl. In PCRE2, '.' matches the +next character unless it is the start of a newline sequence. This means that, +if the newline setting is CR, CRLF, or NUL, '.' will match the code point LF +(0x0A) in ASCII/Unicode environments, and NL (either 0x15 or 0x25) when using +EBCDIC. In Perl, '.' appears never to match LF, even when 0x0A is not a newline +indicator. +
++2. PCRE2 has only a subset of Perl's Unicode support. Details of what it does +have are given in the +pcre2unicode +page. +
++3. Like Perl, PCRE2 allows repeat quantifiers on parenthesized assertions, but +they do not mean what you might think. For example, (?!a){3} does not assert +that the next three characters are not "a". It just asserts that the next +character is not "a" three times (in principle; PCRE2 optimizes this to run the +assertion just once). Perl allows some repeat quantifiers on other assertions, +for example, \b* , but these do not seem to have any use. PCRE2 does not allow +any kind of quantifier on non-lookaround assertions. +
++4. If a braced quantifier such as {1,2} appears where there is nothing to +repeat (for example, at the start of a branch), PCRE2 raises an error whereas +Perl treats the quantifier characters as literal. +
++5. Capture groups that occur inside negative lookaround assertions are counted, +but their entries in the offsets vector are set only when a negative assertion +is a condition that has a matching branch (that is, the condition is false). +Perl may set such capture groups in other circumstances. +
++6. The following Perl escape sequences are not supported: \F, \l, \L, \u, +\U, and \N when followed by a character name. \N on its own, matching a +non-newline character, and \N{U+dd..}, matching a Unicode code point, are +supported. The escapes that modify the case of following letters are +implemented by Perl's general string-handling and are not part of its pattern +matching engine. If any of these are encountered by PCRE2, an error is +generated by default. However, if either of the PCRE2_ALT_BSUX or +PCRE2_EXTRA_ALT_BSUX options is set, \U and \u are interpreted as ECMAScript +interprets them. +
++7. The Perl escape sequences \p, \P, and \X are supported only if PCRE2 is +built with Unicode support (the default). The properties that can be tested +with \p and \P are limited to the general category properties such as Lu and +Nd, the derived properties Any and LC (synonym L&), script names such as Greek +or Han, Bidi_Class, Bidi_Control, and a few binary properties. Both PCRE2 and +Perl support the Cs (surrogate) property, but in PCRE2 its use is limited. See +the +pcre2pattern +documentation for details. The long synonyms for property names that Perl +supports (such as \p{Letter}) are not supported by PCRE2, nor is it permitted +to prefix any of these properties with "Is". +
++8. PCRE2 supports the \Q...\E escape for quoting substrings. Characters +in between are treated as literals. However, this is slightly different from +Perl in that $ and @ are also handled as literals inside the quotes. In Perl, +they cause variable interpolation (PCRE2 does not have variables). Also, Perl +does "double-quotish backslash interpolation" on any backslashes between \Q +and \E which, its documentation says, "may lead to confusing results". PCRE2 +treats a backslash between \Q and \E just like any other character. Note the +following examples: +
+ Pattern PCRE2 matches Perl matches + + \Qabc$xyz\E abc$xyz abc followed by the contents of $xyz + \Qabc\$xyz\E abc\$xyz abc\$xyz + \Qabc\E\$\Qxyz\E abc$xyz abc$xyz + \QA\B\E A\B A\B + \Q\\E \ \\E ++The \Q...\E sequence is recognized both inside and outside character classes +by both PCRE2 and Perl. + +
+9. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code}) +constructions. However, PCRE2 does have a "callout" feature, which allows an +external function to be called during pattern matching. See the +pcre2callout +documentation for details. +
++10. Subroutine calls (whether recursive or not) were treated as atomic groups +up to PCRE2 release 10.23, but from release 10.30 this changed, and +backtracking into subroutine calls is now supported, as in Perl. +
++11. In PCRE2, if any of the backtracking control verbs are used in a group that +is called as a subroutine (whether or not recursively), their effect is +confined to that group; it does not extend to the surrounding pattern. This is +not always the case in Perl. In particular, if (*THEN) is present in a group +that is called as a subroutine, its action is limited to that group, even if +the group does not contain any | characters. Note that such groups are +processed as anchored at the point where they are tested. +
++12. If a pattern contains more than one backtracking control verb, the first +one that is backtracked onto acts. For example, in the pattern +A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure in C +triggers (*PRUNE). Perl's behaviour is more complex; in many cases it is the +same as PCRE2, but there are cases where it differs. +
++13. There are some differences that are concerned with the settings of captured +strings when part of a pattern is repeated. For example, matching "aba" against +the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE2 it is set to +"b". +
++14. PCRE2's handling of duplicate capture group numbers and names is not as +general as Perl's. This is a consequence of the fact that PCRE2 works internally +just with numbers, using an external table to translate between numbers and +names. In particular, a pattern such as (?|(?&lt;a&gt;A)|(?&lt;b&gt;B)), where the two +capture groups have the same number but different names, is not supported, and +causes an error at compile time. If it were allowed, it would not be possible +to distinguish which group matched, because both names map to capture group +number 1. To avoid this confusing situation, an error is given at compile time. +
++15. Perl used to recognize comments in some places that PCRE2 does not, for +example, between the ( and ? at the start of a group. If the /x modifier is +set, Perl allowed white space between ( and ? though the latest Perls give an +error (for a while it was just deprecated). There may still be some cases where +Perl behaves differently. +
++16. Perl, when in warning mode, gives warnings for character classes such as +[A-\d] or [a-[:digit:]]. It then treats the hyphens as literals. PCRE2 has no +warning features, so it gives an error in these cases because they are almost +certainly user mistakes. +
++17. In PCRE2, the upper/lower case character properties Lu and Ll are not +affected when case-independent matching is specified. For example, \p{Lu} +always matches an upper case letter. I think Perl has changed in this respect; +in the release at the time of writing (5.38), \p{Lu} and \p{Ll} match all +letters, regardless of case, when case independence is specified. +
++18. From release 5.32.0, Perl locks out the use of \K in lookaround +assertions. From release 10.38 PCRE2 does the same by default. However, there +is an option for re-enabling the previous behaviour. When this option is set, +\K is acted on when it occurs in positive assertions, but is ignored in +negative assertions. +
+
+19. PCRE2 provides some extensions to the Perl regular expression facilities.
+Perl 5.10 included new features that were not in earlier versions of Perl, some
+of which (such as named parentheses) were in PCRE2 for some time before. This
+list is with respect to Perl 5.38:
+
+
+(a) If PCRE2_DOLLAR_ENDONLY is set and PCRE2_MULTILINE is not set, the $
+meta-character matches only at the very end of the string.
+
+
+(b) A backslash followed by a letter with no special meaning is faulted. (Perl
+can be made to issue a warning.)
+
+
+(c) If PCRE2_UNGREEDY is set, the greediness of the repetition quantifiers is
+inverted, that is, by default they are not greedy, but if followed by a
+question mark they are.
+
+
+(d) PCRE2_ANCHORED can be used at matching time to force a pattern to be tried
+only at the first matching position in the subject string.
+
+
+(e) The PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART
+options have no Perl equivalents.
+
+
+(f) The \R escape sequence can be restricted to match only CR, LF, or CRLF
+by the PCRE2_BSR_ANYCRLF option.
+
+
+(g) The callout facility is PCRE2-specific. Perl supports codeblocks and
+variable interpolation, but not general hooks on every match.
+
+
+(h) The partial matching facility is PCRE2-specific.
+
+
+(i) The alternative matching function (pcre2_dfa_match()) matches in a
+different way and is not Perl-compatible.
+
+
+(j) PCRE2 recognizes some special sequences such as (*CR) or (*NO_JIT) at
+the start of a pattern. These set overall options that cannot be changed within
+the pattern.
+
+
+(k) PCRE2 supports non-atomic positive lookaround assertions. This is an
+extension to the lookaround facilities. The default, Perl-compatible
+lookarounds are atomic.
+
+
+(l) There are three syntactical items in patterns that can refer to a capturing
+group by number: back references such as \g{2}, subroutine calls such as (?3),
+and condition references such as (?(4)...). PCRE2 supports relative group
+numbers such as +2 and -4 in all three cases. Perl supports both plus and minus
+for subroutine calls, but only minus for back references, and no relative
+numbering at all for conditions.
+
+20. Perl has different limits than PCRE2. See the +pcre2limit +documentation for details. As of 5.10, Perl moved from recursion to iteration, +keeping the intermediate matches on the heap, which is ~10% slower but does not +run into any stack-overflow limit. PCRE2 made a similar change at release +10.30, and also has many build-time and run-time customizable limits. +
++21. Unlike Perl, PCRE2 doesn't have character set modifiers, and in particular has no way +to set characters by context in the way that Perl's "/d" does. A regular expression using +PCRE2_UTF and PCRE2_UCP will use similar rules to Perl's "/u"; something closer +to "/a" could be selected by adding other PCRE2_EXTRA_ASCII* options on top. +
++22. Some recursive patterns that Perl diagnoses as infinite recursions can be +handled by PCRE2, either by the interpreter or the JIT. An example is +/(?:|(?0)abcd)(?(R)|\z)/, which matches a sequence of any number of repeated +"abcd" substrings at the end of the subject. +
+
+Philip Hazel
+
+Retired from University Computing Service
+
+Cambridge, England.
+
+
+Last updated: 30 November 2023
+
+Copyright © 1997-2023 University of Cambridge.
+
+
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2convert.html b/doc/html/pcre2convert.html new file mode 100644 index 0000000..6b9fea5 --- /dev/null +++ b/doc/html/pcre2convert.html @@ -0,0 +1,191 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+This document describes a set of functions that can be used to convert +"foreign" patterns into PCRE2 regular expressions. This facility is currently +experimental, and may be changed in future releases. Two kinds of pattern, +globs and POSIX patterns, are supported. +
+
+pcre2_convert_context *pcre2_convert_context_create(
+ pcre2_general_context *gcontext);
+
+
+pcre2_convert_context *pcre2_convert_context_copy(
+ pcre2_convert_context *cvcontext);
+
+
+void pcre2_convert_context_free(pcre2_convert_context *cvcontext);
+
+
+int pcre2_set_glob_escape(pcre2_convert_context *cvcontext,
+ uint32_t escape_char);
+
+
+int pcre2_set_glob_separator(pcre2_convert_context *cvcontext,
+ uint32_t separator_char);
+
+
+A convert context is used to hold parameters that affect the way that pattern
+conversion works. Like all PCRE2 contexts, you need to use a context only if
+you want to override the defaults. There are the usual create, copy, and free
+functions. If custom memory management functions are set in a general context
+that is passed to pcre2_convert_context_create(), they are used for all
+memory management within the conversion functions.
+
+There are only two parameters in the convert context at present. Both apply +only to glob conversions. The escape character defaults to grave accent under +Windows, otherwise backslash. It can be set to zero, meaning no escape +character, or to any punctuation character with a code point less than 256. +The separator character defaults to backslash under Windows, otherwise forward +slash. It can be set to forward slash, backslash, or dot. +
++The two setting functions return zero on success, or PCRE2_ERROR_BADDATA if +their second argument is invalid. +
+
+int pcre2_pattern_convert(PCRE2_SPTR pattern, PCRE2_SIZE length,
+ uint32_t options, PCRE2_UCHAR **buffer,
+ PCRE2_SIZE *blength, pcre2_convert_context *cvcontext);
+
+
+void pcre2_converted_pattern_free(PCRE2_UCHAR *converted_pattern);
+
+
+The first two arguments of pcre2_pattern_convert() define the foreign
+pattern that is to be converted. The length may be given as
+PCRE2_ZERO_TERMINATED. The options argument defines how the pattern is to
+be processed. If the input is UTF, the PCRE2_CONVERT_UTF option should be set.
+PCRE2_CONVERT_NO_UTF_CHECK may also be set if you are sure the input is valid.
+One or more of the glob options, or one of the following POSIX options must be
+set to define the type of conversion that is required:
+
+ PCRE2_CONVERT_GLOB + PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR + PCRE2_CONVERT_GLOB_NO_STARSTAR + PCRE2_CONVERT_POSIX_BASIC + PCRE2_CONVERT_POSIX_EXTENDED ++Details of the conversions are given below. The buffer and blength +arguments define how the output is handled: + +
+If buffer is NULL, the function just returns the length of the converted +pattern via blength. This is one less than the length of buffer needed, +because a terminating zero is always added to the output. +
++If buffer points to a NULL pointer, an output buffer is obtained using +the allocator in the context or malloc() if no context is supplied. A +pointer to this buffer is placed in the variable to which buffer points. +When no longer needed the output buffer must be freed by calling +pcre2_converted_pattern_free(). If this function is called with a NULL +argument, it returns immediately without doing anything. +
++If buffer points to a non-NULL pointer, blength must be set to the +actual length of the buffer provided (in code units). +
++In all cases, after successful conversion, the variable pointed to by +blength is updated to the length actually used (in code units), excluding +the terminating zero that is always added. +
++If an error occurs, the length (via blength) is set to the offset +within the input pattern where the error was detected. Only gross syntax errors +are caught; there are plenty of errors that will get passed on for +pcre2_compile() to discover. +
++The return from pcre2_pattern_convert() is zero on success or a non-zero +PCRE2 error code. Note that PCRE2 error codes may be positive or negative: +pcre2_compile() uses mostly positive codes and pcre2_match() +negative ones; pcre2_pattern_convert() uses existing codes of both kinds. A +textual error message can be obtained by calling +pcre2_get_error_message(). +
++Globs are used to match file names, and consequently have the concept of a +"path separator", which defaults to backslash under Windows and forward slash +otherwise. If PCRE2_CONVERT_GLOB is set, the wildcards * and ? are not +permitted to match separator characters, but the double-star (**) feature +(which does match separators) is supported. +
++PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR matches globs with wildcards allowed to +match separator characters. PCRE2_CONVERT_GLOB_NO_STARSTAR matches globs with +the double-star feature disabled. These options may be given together. +
++POSIX defines two kinds of regular expression pattern: basic and extended. +These can be processed by setting PCRE2_CONVERT_POSIX_BASIC or +PCRE2_CONVERT_POSIX_EXTENDED, respectively. +
++In POSIX patterns, backslash is not special in a character class. Unmatched +closing parentheses are treated as literals. +
++In basic patterns, ? + | {} and () must be escaped to be recognized +as metacharacters outside a character class. If the first character in the +pattern is * it is treated as a literal. ^ is a metacharacter only at the start +of a branch. +
++In extended patterns, a backslash not in a character class always +makes the next character literal, whatever it is. There are no backreferences. +
++Note: POSIX mandates that the longest possible match at the first matching +position must be found. This is not what pcre2_match() does; it yields +the first match that is found. An application can use pcre2_dfa_match() +to find the longest match, but that does not support backreferences (but then +neither do POSIX extended patterns). +
+
+Philip Hazel
+
+Retired from University Computing Service
+
+Cambridge, England.
+
+
+Last updated: 28 June 2018
+
+Copyright © 1997-2018 University of Cambridge.
+
+
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2demo.html b/doc/html/pcre2demo.html new file mode 100644 index 0000000..1cb7e0a --- /dev/null +++ b/doc/html/pcre2demo.html @@ -0,0 +1,518 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SOURCE CODE
+
+
+/*************************************************
+* PCRE2 DEMONSTRATION PROGRAM *
+*************************************************/
+
+/* This is a demonstration program to illustrate a straightforward way of
+using the PCRE2 regular expression library from a C program. See the
+pcre2sample documentation for a short discussion ("man pcre2sample" if you have
+the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
+incompatible with the original PCRE API.
+
+There are actually three libraries, each supporting a different code unit
+width. This demonstration program uses the 8-bit library. The default is to
+process each code unit as a separate character, but if the pattern begins with
+"(*UTF)", both it and the subject are treated as UTF-8 strings, where
+characters may occupy multiple code units.
+
+In Unix-like environments, if PCRE2 is installed in your standard system
+libraries, you should be able to compile this program using this command:
+
+cc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
+
+If PCRE2 is not installed in a standard place, it is likely to be installed
+with support for the pkg-config mechanism. If you have pkg-config, you can
+compile this program using this command:
+
+cc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
+
+If you do not have pkg-config, you may have to use something like this:
+
+cc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \
+ -R/usr/local/lib -lpcre2-8 -o pcre2demo
+
+Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
+library files for PCRE2 are installed on your system. Only some operating
+systems (Solaris is one) use the -R option.
+
+Building under Windows:
+
+If you want to statically link this program against a non-dll .a file, you must
+define PCRE2_STATIC before including pcre2.h, so in this environment, uncomment
+the following line. */
+
+/* #define PCRE2_STATIC */
+
+/* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h.
+For a program that uses only one code unit width, setting it to 8, 16, or 32
+makes it possible to use generic function names such as pcre2_compile(). Note
+that just changing 8 to 16 (for example) is not sufficient to convert this
+program to process 16-bit characters. Even in a fully 16-bit environment, where
+string-handling functions such as strcmp() and printf() work with 16-bit
+characters, the code for handling the table of named substrings will still need
+to be modified. */
+
+#define PCRE2_CODE_UNIT_WIDTH 8
+
+#include <stdio.h>
+#include <string.h>
+#include <pcre2.h>
+
+
+/**************************************************************************
+* Here is the program. The API includes the concept of "contexts" for *
+* setting up unusual interface requirements for compiling and matching, *
+* such as custom memory managers and non-standard newline definitions. *
+* This program does not do any of this, so it makes no use of contexts, *
+* always passing NULL where a context could be given. *
+**************************************************************************/
+
+int main(int argc, char **argv)
+{
+pcre2_code *re;
+PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */
+PCRE2_SPTR subject; /* the appropriate width (in this case, 8 bits). */
+PCRE2_SPTR name_table;
+
+int crlf_is_newline;
+int errornumber;
+int find_all;
+int i;
+int rc;
+int utf8;
+
+uint32_t option_bits;
+uint32_t namecount;
+uint32_t name_entry_size;
+uint32_t newline;
+
+PCRE2_SIZE erroroffset;
+PCRE2_SIZE *ovector;
+PCRE2_SIZE subject_length;
+
+pcre2_match_data *match_data;
+
+
+/**************************************************************************
+* First, sort out the command line. There is only one possible option at *
+* the moment, "-g" to request repeated matching to find all occurrences, *
+* like Perl's /g option. We set the variable find_all to a non-zero value *
+* if the -g option is present. *
+**************************************************************************/
+
+find_all = 0;
+for (i = 1; i < argc; i++)
+ {
+ if (strcmp(argv[i], "-g") == 0) find_all = 1;
+ else if (argv[i][0] == '-')
+ {
+ printf("Unrecognised option %s\n", argv[i]);
+ return 1;
+ }
+ else break;
+ }
+
+/* After the options, we require exactly two arguments, which are the pattern,
+and the subject string. */
+
+if (argc - i != 2)
+ {
+ printf("Exactly two arguments required: a regex and a subject string\n");
+ return 1;
+ }
+
+/* Pattern and subject are char arguments, so they can be straightforwardly
+cast to PCRE2_SPTR because we are working in 8-bit code units. The subject
+length is cast to PCRE2_SIZE for completeness, though PCRE2_SIZE is in fact
+defined to be size_t. */
+
+pattern = (PCRE2_SPTR)argv[i];
+subject = (PCRE2_SPTR)argv[i+1];
+subject_length = (PCRE2_SIZE)strlen((char *)subject);
+
+
+/*************************************************************************
+* Now we are going to compile the regular expression pattern, and handle *
+* any errors that are detected. *
+*************************************************************************/
+
+re = pcre2_compile(
+ pattern, /* the pattern */
+ PCRE2_ZERO_TERMINATED, /* indicates pattern is zero-terminated */
+ 0, /* default options */
+ &errornumber, /* for error number */
+ &erroroffset, /* for error offset */
+ NULL); /* use default compile context */
+
+/* Compilation failed: print the error message and exit. */
+
+if (re == NULL)
+ {
+ PCRE2_UCHAR buffer[256];
+ pcre2_get_error_message(errornumber, buffer, sizeof(buffer));
+ printf("PCRE2 compilation failed at offset %d: %s\n", (int)erroroffset,
+ buffer);
+ return 1;
+ }
+
+
+/*************************************************************************
+* If the compilation succeeded, we call PCRE2 again, in order to do a *
+* pattern match against the subject string. This does just ONE match. If *
+* further matching is needed, it will be done below. Before running the *
+* match we must set up a match_data block for holding the result. Using *
+* pcre2_match_data_create_from_pattern() ensures that the block is *
+* exactly the right size for the number of capturing parentheses in the *
+* pattern. If you need to know the actual size of a match_data block as *
+* a number of bytes, you can find it like this: *
+* *
+* PCRE2_SIZE match_data_size = pcre2_get_match_data_size(match_data); *
+*************************************************************************/
+
+match_data = pcre2_match_data_create_from_pattern(re, NULL);
+
+/* Now run the match. */
+
+rc = pcre2_match(
+ re, /* the compiled pattern */
+ subject, /* the subject string */
+ subject_length, /* the length of the subject */
+ 0, /* start at offset 0 in the subject */
+ 0, /* default options */
+ match_data, /* block for storing the result */
+ NULL); /* use default match context */
+
+/* Matching failed: handle error cases */
+
+if (rc < 0)
+ {
+ switch(rc)
+ {
+ case PCRE2_ERROR_NOMATCH: printf("No match\n"); break;
+ /*
+ Handle other special cases if you like
+ */
+ default: printf("Matching error %d\n", rc); break;
+ }
+ pcre2_match_data_free(match_data); /* Release memory used for the match */
+ pcre2_code_free(re); /* data and the compiled pattern. */
+ return 1;
+ }
+
+/* Match succeeded. Get a pointer to the output vector, where string offsets
+are stored. */
+
+ovector = pcre2_get_ovector_pointer(match_data);
+printf("Match succeeded at offset %d\n", (int)ovector[0]);
+
+
+/*************************************************************************
+* We have found the first match within the subject string. If the output *
+* vector wasn't big enough, say so. Then output any substrings that were *
+* captured. *
+*************************************************************************/
+
+/* The output vector wasn't big enough. This should not happen, because we used
+pcre2_match_data_create_from_pattern() above. */
+
+if (rc == 0)
+ printf("ovector was not big enough for all the captured substrings\n");
+
+/* Since release 10.38 PCRE2 has locked out the use of \K in lookaround
+assertions. However, there is an option to re-enable the old behaviour. If that
+is set, it is possible to run patterns such as /(?=.\K)/ that use \K in an
+assertion to set the start of a match later than its end. In this demonstration
+program, we show how to detect this case, but it shouldn't arise because the
+option is never set. */
+
+if (ovector[0] > ovector[1])
+ {
+ printf("\\K was used in an assertion to set the match start after its end.\n"
+ "From end to start the match was: %.*s\n", (int)(ovector[0] - ovector[1]),
+ (char *)(subject + ovector[1]));
+ printf("Run abandoned\n");
+ pcre2_match_data_free(match_data);
+ pcre2_code_free(re);
+ return 1;
+ }
+
+/* Show substrings stored in the output vector by number. Obviously, in a real
+application you might want to do things other than print them. */
+
+for (i = 0; i < rc; i++)
+ {
+ PCRE2_SPTR substring_start = subject + ovector[2*i];
+ PCRE2_SIZE substring_length = ovector[2*i+1] - ovector[2*i];
+ printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
+ }
+
+
+/**************************************************************************
+* That concludes the basic part of this demonstration program. We have *
+* compiled a pattern, and performed a single match. The code that follows *
+* shows first how to access named substrings, and then how to code for *
+* repeated matches on the same subject. *
+**************************************************************************/
+
+/* See if there are any named substrings, and if so, show them by name. First
+we have to extract the count of named parentheses from the pattern. */
+
+(void)pcre2_pattern_info(
+ re, /* the compiled pattern */
+ PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
+ &namecount); /* where to put the answer */
+
+if (namecount == 0) printf("No named substrings\n"); else
+ {
+ PCRE2_SPTR tabptr;
+ printf("Named substrings\n");
+
+ /* Before we can access the substrings, we must extract the table for
+ translating names to numbers, and the size of each entry in the table. */
+
+ (void)pcre2_pattern_info(
+ re, /* the compiled pattern */
+ PCRE2_INFO_NAMETABLE, /* address of the table */
+ &name_table); /* where to put the answer */
+
+ (void)pcre2_pattern_info(
+ re, /* the compiled pattern */
+ PCRE2_INFO_NAMEENTRYSIZE, /* size of each entry in the table */
+ &name_entry_size); /* where to put the answer */
+
+ /* Now we can scan the table and, for each entry, print the number, the name,
+ and the substring itself. In the 8-bit library the number is held in two
+ bytes, most significant first. */
+
+ tabptr = name_table;
+ for (i = 0; i < namecount; i++)
+ {
+ int n = (tabptr[0] << 8) | tabptr[1];
+ printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2,
+ (int)(ovector[2*n+1] - ovector[2*n]), subject + ovector[2*n]);
+ tabptr += name_entry_size;
+ }
+ }
+
+
+/*************************************************************************
+* If the "-g" option was given on the command line, we want to continue *
+* to search for additional matches in the subject string, in a similar *
+* way to the /g option in Perl. This turns out to be trickier than you *
+* might think because of the possibility of matching an empty string. *
+* What happens is as follows: *
+* *
+* If the previous match was NOT for an empty string, we can just start *
+* the next match at the end of the previous one. *
+* *
+* If the previous match WAS for an empty string, we can't do that, as it *
+* would lead to an infinite loop. Instead, a call of pcre2_match() is *
+* made with the PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED flags set. The *
+* first of these tells PCRE2 that an empty string at the start of the *
+* subject is not a valid match; other possibilities must be tried. The *
+* second flag restricts PCRE2 to one match attempt at the initial string *
+* position. If this match succeeds, an alternative to the empty string *
+* match has been found, and we can print it and proceed round the loop, *
+* advancing by the length of whatever was found. If this match does not *
+* succeed, we still stay in the loop, advancing by just one character. *
+* In UTF-8 mode, which can be set by (*UTF) in the pattern, this may be *
+* more than one byte. *
+* *
+* However, there is a complication concerned with newlines. When the *
+* newline convention is such that CRLF is a valid newline, we must *
+* advance by two characters rather than one. The newline convention can *
+* be set in the regex by (*CR), etc.; if not, we must find the default. *
+*************************************************************************/
+
+if (!find_all) /* Check for -g */
+ {
+ pcre2_match_data_free(match_data); /* Release the memory that was used */
+ pcre2_code_free(re); /* for the match data and the pattern. */
+ return 0; /* Exit the program. */
+ }
+
+/* Before running the loop, check for UTF-8 and whether CRLF is a valid newline
+sequence. First, find the options with which the regex was compiled and extract
+the UTF state. */
+
+(void)pcre2_pattern_info(re, PCRE2_INFO_ALLOPTIONS, &option_bits);
+utf8 = (option_bits & PCRE2_UTF) != 0;
+
+/* Now find the newline convention and see whether CRLF is a valid newline
+sequence. */
+
+(void)pcre2_pattern_info(re, PCRE2_INFO_NEWLINE, &newline);
+crlf_is_newline = newline == PCRE2_NEWLINE_ANY ||
+ newline == PCRE2_NEWLINE_CRLF ||
+ newline == PCRE2_NEWLINE_ANYCRLF;
+
+/* Loop for second and subsequent matches */
+
+for (;;)
+ {
+ uint32_t options = 0; /* Normally no options */
+ PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */
+
+ /* If the previous match was for an empty string, we are finished if we are
+ at the end of the subject. Otherwise, arrange to run another match at the
+ same point to see if a non-empty match can be found. */
+
+ if (ovector[0] == ovector[1])
+ {
+ if (ovector[0] == subject_length) break;
+ options = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
+ }
+
+ /* If the previous match was not an empty string, there is one tricky case to
+ consider. If a pattern contains \K within a lookbehind assertion at the
+ start, the end of the matched string can be at the offset where the match
+ started. Without special action, this leads to a loop that keeps on matching
+ the same substring. We must detect this case and arrange to move the start on
+ by one character. The pcre2_get_startchar() function returns the starting
+ offset that was passed to pcre2_match(). */
+
+ else
+ {
+ PCRE2_SIZE startchar = pcre2_get_startchar(match_data);
+ if (start_offset <= startchar)
+ {
+ if (startchar >= subject_length) break; /* Reached end of subject. */
+ start_offset = startchar + 1; /* Advance by one character. */
+ if (utf8) /* If UTF-8, it may be more */
+ { /* than one code unit. */
+ for (; start_offset < subject_length; start_offset++)
+ if ((subject[start_offset] & 0xc0) != 0x80) break;
+ }
+ }
+ }
+
+ /* Run the next matching operation */
+
+ rc = pcre2_match(
+ re, /* the compiled pattern */
+ subject, /* the subject string */
+ subject_length, /* the length of the subject */
+ start_offset, /* starting offset in the subject */
+ options, /* options */
+ match_data, /* block for storing the result */
+ NULL); /* use default match context */
+
+ /* This time, a result of NOMATCH isn't an error. If the value in "options"
+ is zero, it just means we have found all possible matches, so the loop ends.
+ Otherwise, it means we have failed to find a non-empty-string match at a
+ point where there was a previous empty-string match. In this case, we do what
+ Perl does: advance the matching position by one character, and continue. We
+ do this by setting the "end of previous match" offset, because that is picked
+ up at the top of the loop as the point at which to start again.
+
+ There are two complications: (a) When CRLF is a valid newline sequence, and
+ the current position is just before it, advance by an extra byte. (b)
+ Otherwise we must ensure that we skip an entire UTF character if we are in
+ UTF mode. */
+
+ if (rc == PCRE2_ERROR_NOMATCH)
+ {
+ if (options == 0) break; /* All matches found */
+ ovector[1] = start_offset + 1; /* Advance one code unit */
+ if (crlf_is_newline && /* If CRLF is a newline & */
+ start_offset < subject_length - 1 && /* we are at CRLF, */
+ subject[start_offset] == '\r' &&
+ subject[start_offset + 1] == '\n')
+ ovector[1] += 1; /* Advance by one more. */
+ else if (utf8) /* Otherwise, ensure we */
+ { /* advance a whole UTF-8 */
+ while (ovector[1] < subject_length) /* character. */
+ {
+ if ((subject[ovector[1]] & 0xc0) != 0x80) break;
+ ovector[1] += 1;
+ }
+ }
+ continue; /* Go round the loop again */
+ }
+
+ /* Other matching errors are not recoverable. */
+
+ if (rc < 0)
+ {
+ printf("Matching error %d\n", rc);
+ pcre2_match_data_free(match_data);
+ pcre2_code_free(re);
+ return 1;
+ }
+
+ /* Match succeeded */
+
+ printf("\nMatch succeeded again at offset %d\n", (int)ovector[0]);
+
+ /* The match succeeded, but the output vector wasn't big enough. This
+ should not happen. */
+
+ if (rc == 0)
+ printf("ovector was not big enough for all the captured substrings\n");
+
+ /* We must guard against patterns such as /(?=.\K)/ that use \K in an
+ assertion to set the start of a match later than its end. In this
+ demonstration program, we just detect this case and give up. */
+
+ if (ovector[0] > ovector[1])
+ {
+ printf("\\K was used in an assertion to set the match start after its end.\n"
+ "From end to start the match was: %.*s\n", (int)(ovector[0] - ovector[1]),
+ (char *)(subject + ovector[1]));
+ printf("Run abandoned\n");
+ pcre2_match_data_free(match_data);
+ pcre2_code_free(re);
+ return 1;
+ }
+
+ /* As before, show substrings stored in the output vector by number, and then
+ also any named substrings. */
+
+ for (i = 0; i < rc; i++)
+ {
+ PCRE2_SPTR substring_start = subject + ovector[2*i];
+ size_t substring_length = ovector[2*i+1] - ovector[2*i];
+ printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
+ }
+
+ if (namecount == 0) printf("No named substrings\n"); else
+ {
+ PCRE2_SPTR tabptr = name_table;
+ printf("Named substrings\n");
+ for (i = 0; i < namecount; i++)
+ {
+ int n = (tabptr[0] << 8) | tabptr[1];
+ printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2,
+ (int)(ovector[2*n+1] - ovector[2*n]), subject + ovector[2*n]);
+ tabptr += name_entry_size;
+ }
+ }
+ } /* End of loop to find second and subsequent matches */
+
+printf("\n");
+pcre2_match_data_free(match_data);
+pcre2_code_free(re);
+return 0;
+}
+
+/* End of pcre2demo.c */
+
+Return to the PCRE2 index page.
+
diff --git a/doc/html/pcre2grep.html b/doc/html/pcre2grep.html
new file mode 100644
index 0000000..bd12246
--- /dev/null
+++ b/doc/html/pcre2grep.html
@@ -0,0 +1,1125 @@
+
+
+pcre2grep specification
+
+
+pcre2grep man page
+
+Return to the PCRE2 index page.
+
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+pcre2grep [options] [long options] [pattern] [path1 path2 ...] +
++pcre2grep searches files for character patterns, in the same way as other +grep commands do, but it uses the PCRE2 regular expression library to support +patterns that are compatible with the regular expressions of Perl 5. See +pcre2syntax(3) +for a quick-reference summary of pattern syntax, or +pcre2pattern(3) +for a full description of the syntax and semantics of the regular expressions +that PCRE2 supports. +
++Patterns, whether supplied on the command line or in a separate file, are given +without delimiters. For example: +
+ pcre2grep Thursday /etc/motd ++If you attempt to use delimiters (for example, by surrounding a pattern with +slashes, as is common in Perl scripts), they are interpreted as part of the +pattern. Quotes can of course be used to delimit patterns on the command line +because they are interpreted by the shell, and indeed quotes are required if a +pattern contains white space or shell metacharacters. + +
+The first argument that follows any option settings is treated as the single +pattern to be matched when neither -e nor -f is present. +Conversely, when one or both of these options are used to specify patterns, all +arguments are treated as path names. At least one of -e, -f, or an +argument pattern must be provided. +
++If no files are specified, pcre2grep reads the standard input. The +standard input can also be referenced by a name consisting of a single hyphen. +For example: +
+ pcre2grep some-pattern file1 - file3 ++By default, input files are searched line by line, so pattern assertions about +the beginning and end of a subject string (^, $, \A, \Z, and \z) match at +the beginning and end of each line. When a line matches a pattern, it is copied +to the standard output, and if there is more than one file, the file name is +output at the start of each line, followed by a colon. However, there are +options that can change how pcre2grep behaves. For example, the -M +option makes it possible to search for strings that span line boundaries. What +defines a line boundary is controlled by the -N (--newline) option. +The -h and -H options control whether or not file names are shown, +and the -Z option changes the file name terminator to a zero byte. + +
+The amount of memory used for buffering files that are being scanned is +controlled by parameters that can be set by the --buffer-size and +--max-buffer-size options. The first of these sets the size of buffer +that is obtained at the start of processing. If an input file contains very +long lines, a larger buffer may be needed; this is handled by automatically +extending the buffer, up to the limit specified by --max-buffer-size. The +default values for these parameters can be set when pcre2grep is +built; if nothing is specified, the defaults are set to 20KiB and 1MiB +respectively. An error occurs if a line is too long and the buffer can no +longer be expanded. +
++The block of memory that is actually used is three times the "buffer size", to +allow for buffering "before" and "after" lines. If the buffer size is too +small, fewer than requested "before" and "after" lines may be output. +
++When matching with a multiline pattern, the size of the buffer must be at least +half of the maximum match expected or the pattern might fail to match. +
++Patterns can be no longer than 8KiB or BUFSIZ bytes, whichever is the greater. +BUFSIZ is defined in <stdio.h>. When there is more than one pattern +(specified by the use of -e and/or -f), each pattern is applied to +each line in the order in which they are defined, except that all the -e +patterns are tried before the -f patterns. +
++By default, as soon as one pattern matches a line, no further patterns are +considered. However, if --colour (or --color) is used to colour the +matching substrings, or if --only-matching, --file-offsets, +--line-offsets, or --output is used to output only the part of the +line that matched (either shown literally, or as an offset), the behaviour is +different. In this situation, all the patterns are applied to the line. If +there is more than one match, the one that begins nearest to the start of the +subject is processed; if there is more than one match at that position, the one +with the longest matching substring is processed; if the matching substrings +are equal, the first match found is processed. +
++Scanning with all the patterns resumes immediately following the match, so that +later matches on the same line can be found. Note, however, that an overlapping +match that starts in the middle of another match will not be processed. +
++The above behaviour was changed at release 10.41 to be more compatible with GNU +grep. In earlier releases, pcre2grep did not recognize matches from +later patterns that were earlier in the subject. +
++Patterns that can match an empty string are accepted, but empty string +matches are never recognized. An example is the pattern "(super)?(man)?", in +which all components are optional. This pattern finds all occurrences of both +"super" and "man"; the output differs from matching with "super|man" when only +the matching substrings are being shown. +
++If the LC_ALL or LC_CTYPE environment variable is set, +pcre2grep uses the value to set a locale when calling the PCRE2 library. +The --locale option can be used to override this. +
++Compile-time options for pcre2grep can set it up to use libz or +libbz2 for reading compressed files whose names end in .gz or +.bz2, respectively. You can find out whether your pcre2grep binary +has support for one or both of these file types by running it with the +--help option. If the appropriate support is not present, all files are +treated as plain text. The standard input is always so treated. If a file with +a .gz or .bz2 extension is not in fact compressed, it is read as a +plain text file. When input is from a compressed .gz or .bz2 file, the +--line-buffered option is ignored. +
++By default, a file that contains a binary zero byte within the first 1024 bytes +is identified as a binary file, and is processed specially. However, if the +newline type is specified as NUL, that is, the line terminator is a binary +zero, the test for a binary file is not applied. See the --binary-files +option for a means of changing the way binary files are handled. +
++Patterns passed from the command line are strings that are terminated by a +binary zero, so cannot contain internal zeros. However, patterns that are read +from a file via the -f option may contain binary zeros. +
++The order in which some of the options appear can affect the output. For +example, both the -H and -l options affect the printing of file +names. Whichever comes later in the command line will be the one that takes +effect. Similarly, except where noted below, if an option is given twice, the +later setting is used. Numerical values for options may be followed by K or M, +to signify multiplication by 1024 or 1024*1024 respectively. +
++-- +This terminates the list of options. It is useful if the next item on the +command line starts with a hyphen but is not an option. This allows for the +processing of patterns and file names that start with hyphens. +
++-A number, --after-context=number +Output up to number lines of context after each matching line. Fewer +lines are output if the next match or the end of the file is reached, or if the +processing buffer size has been set too small. If file names and/or line +numbers are being output, a hyphen separator is used instead of a colon for the +context lines (the -Z option can be used to change the file name +terminator to a zero byte). A line containing "--" is output between each group +of lines, unless they are in fact contiguous in the input file. The value of +number is expected to be relatively small. When -c is used, +-A is ignored. +
++-a, --text +Treat binary files as text. This is equivalent to +--binary-files=text. +
++--allow-lookaround-bsk +PCRE2 now forbids the use of \K in lookarounds by default, in line with Perl. +This option causes pcre2grep to set the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK +option, which enables this somewhat dangerous usage. +
++-B number, --before-context=number +Output up to number lines of context before each matching line. Fewer +lines are output if the previous match or the start of the file is within +number lines, or if the processing buffer size has been set too small. If +file names and/or line numbers are being output, a hyphen separator is used +instead of a colon for the context lines (the -Z option can be used to +change the file name terminator to a zero byte). A line containing "--" is +output between each group of lines, unless they are in fact contiguous in the +input file. The value of number is expected to be relatively small. When +-c is used, -B is ignored. +
++--binary-files=word +Specify how binary files are to be processed. If the word is "binary" (the +default), pattern matching is performed on binary files, but the only output is +"Binary file <name> matches" when a match succeeds. If the word is "text", +which is equivalent to the -a or --text option, binary files are +processed in the same way as any other file. In this case, when a match +succeeds, the output may be binary garbage, which can have nasty effects if +sent to a terminal. If the word is "without-match", which is equivalent to the +-I option, binary files are not processed at all; they are assumed not to +be of interest and are skipped without causing any output or affecting the +return code. +
++--buffer-size=number +Set the parameter that controls how much memory is obtained at the start of +processing for buffering files that are being scanned. See also +--max-buffer-size below. +
++-C number, --context=number +Output number lines of context both before and after each matching line. +This is equivalent to setting both -A and -B to the same value. +
+
+-c, --count
+Do not output lines from the files that are being scanned; instead output the
+number of lines that would have been shown, either because they matched, or, if
+-v is set, because they failed to match. By default, this count is
+exactly the same as the number of lines that would have been output, but if the
+-M (multiline) option is used (without -v), there may be more
+suppressed lines than the count (that is, the number of matches).
+
+
+If no lines are selected, the number zero is output. If several files are
+being scanned, a count is output for each of them and the -t option can
+be used to cause a total to be output at the end. However, if the
+--files-with-matches option is also used, only those files whose counts
+are greater than zero are listed. When -c is used, the -A,
+-B, and -C options are ignored.
+
+--colour, --color +If this option is given without any data, it is equivalent to "--colour=auto". +If data is required, it must be given in the same shell item, separated by an +equals sign. +
+
+--colour=value, --color=value
+This option specifies under what circumstances the parts of a line that matched
+a pattern should be coloured in the output. It is ignored if
+--file-offsets, --line-offsets, or --output is set. By
+default, output is not coloured. The value for the --colour option (which
+is optional, see above) may be "never", "always", or "auto". In the last
+case, colouring happens only if the standard output is connected to a terminal.
+More resources are used when colouring is enabled, because pcre2grep has
+to search for all possible matches in a line, not just one, in order to colour
+them all.
+
+
+The colour that is used can be specified by setting one of the environment
+variables PCRE2GREP_COLOUR, PCRE2GREP_COLOR, PCREGREP_COLOUR, or
+PCREGREP_COLOR, which are checked in that order. If none of these are set,
+pcre2grep looks for GREP_COLORS or GREP_COLOR (in that order). The value
+of the variable should be a string of two numbers, separated by a semicolon,
+except in the case of GREP_COLORS, which must start with "ms=" or "mt="
+followed by two semicolon-separated colours, terminated by the end of the
+string or by a colon. If GREP_COLORS does not start with "ms=" or "mt=" it is
+ignored, and GREP_COLOR is checked.
+
+
+If the string obtained from one of the above variables contains any characters
+other than semicolon or digits, the setting is ignored and the default colour
+is used. The string is copied directly into the control string for setting
+colour on a terminal, so it is your responsibility to ensure that the values
+make sense. If no relevant environment variable is set, the default is "1;31",
+which gives red.
+
+-D action, --devices=action +If an input path is not a regular file or a directory, "action" specifies how +it is to be processed. Valid values are "read" (the default) or "skip" +(silently skip the path). +
++-d action, --directories=action +If an input path is a directory, "action" specifies how it is to be processed. +Valid values are "read" (the default in non-Windows environments, for +compatibility with GNU grep), "recurse" (equivalent to the -r option), or +"skip" (silently skip the path, the default in Windows environments). In the +"read" case, directories are read as if they were ordinary files. In some +operating systems the effect of reading a directory like this is an immediate +end-of-file; in others it may provoke an error. +
++--depth-limit=number +See --match-limit below. +
++-E, --case-restrict +When case distinctions are being ignored in Unicode mode, two ASCII letters (K +and S) will by default match Unicode characters U+212A (Kelvin sign) and U+017F +(long S) respectively, as well as their lower case ASCII counterparts. When +this option is set, case equivalences are restricted such that no ASCII +character matches a non-ASCII character, and vice versa. +
+
+-e pattern, --regex=pattern, --regexp=pattern
+Specify a pattern to be matched. This option can be used multiple times in
+order to specify several patterns. It can also be used as a way of specifying a
+single pattern that starts with a hyphen. When -e is used, no argument
+pattern is taken from the command line; all arguments are treated as file
+names. There is no limit to the number of patterns. They are applied to each
+line in the order in which they are defined.
+
+
+If -f is used with -e, the command line patterns are matched first,
+followed by the patterns from the file(s), independent of the order in which
+these options are specified.
+
+--exclude=pattern +Files (but not directories) whose names match the pattern are skipped without +being processed. This applies to all files, whether listed on the command line, +obtained from --file-list, or by scanning a directory. The pattern is a +PCRE2 regular expression, and is matched against the final component of the +file name, not the entire path. The -F, -w, and -x options do +not apply to this pattern. The option may be given any number of times in order +to specify multiple patterns. If a file name matches both an --include +and an --exclude pattern, it is excluded. There is no short form for this +option. +
++--exclude-from=filename +Treat each non-empty line of the file as the data for an --exclude +option. What constitutes a newline when reading the file is the operating +system's default. The --newline option has no effect on this option. This +option may be given more than once in order to specify a number of files to +read. +
++--exclude-dir=pattern +Directories whose names match the pattern are skipped without being processed, +whatever the setting of the --recursive option. This applies to all +directories, whether listed on the command line, obtained from +--file-list, or by scanning a parent directory. The pattern is a PCRE2 +regular expression, and is matched against the final component of the directory +name, not the entire path. The -F, -w, and -x options do not +apply to this pattern. The option may be given any number of times in order to +specify more than one pattern. If a directory matches both --include-dir +and --exclude-dir, it is excluded. There is no short form for this +option. +
++-F, --fixed-strings +Interpret each data-matching pattern as a list of fixed strings, separated by +newlines, instead of as a regular expression. What constitutes a newline for +this purpose is controlled by the --newline option. The -w (match +as a word) and -x (match whole line) options can be used with -F. +They apply to each of the fixed strings. A line is selected if any of the fixed +strings are found in it (subject to -w or -x, if present). This +option applies only to the patterns that are matched against the contents of +files; it does not apply to patterns specified by any of the --include or +--exclude options. +
+
+-f filename, --file=filename
+Read patterns from the file, one per line. As is the case with patterns on the
+command line, no delimiters should be used. What constitutes a newline when
+reading the file is the operating system's default interpretation of \n. The
+--newline option has no effect on this option. Trailing white space is
+removed from each line, and blank lines are ignored. An empty file contains no
+patterns and therefore matches nothing. Patterns read from a file in this way
+may contain binary zeros, which are treated as ordinary data characters.
+
+
+If this option is given more than once, all the specified files are read. A
+data line is output if any of the patterns match it. A file name can be given
+as "-" to refer to the standard input. When -f is used, patterns
+specified on the command line using -e may also be present; they are
+matched before the file's patterns. However, no pattern is taken from the
+command line; all arguments are treated as the names of paths to be searched.
+
+--file-list=filename +Read a list of files and/or directories that are to be scanned from the given +file, one per line. What constitutes a newline when reading the file is the +operating system's default. Trailing white space is removed from each line, and +blank lines are ignored. These paths are processed before any that are listed +on the command line. The file name can be given as "-" to refer to the standard +input. If --file and --file-list are both specified as "-", +patterns are read first. This is useful only when the standard input is a +terminal, from which further lines (the list of files) can be read after an +end-of-file indication. If this option is given more than once, all the +specified files are read. +
++--file-offsets +Instead of showing lines or parts of lines that match, show each match as an +offset from the start of the file and a length, separated by a comma. In this +mode, --colour has no effect, and no context is shown. That is, the +-A, -B, and -C options are ignored. If there is more than one +match in a line, each of them is shown separately. This option is mutually +exclusive with --output, --line-offsets, and --only-matching. +
++--group-separator=text +Output this text string instead of two hyphens between groups of lines when +-A, -B, or -C is in use. See also --no-group-separator. +
++-H, --with-filename +Force the inclusion of the file name at the start of output lines when +searching a single file. The file name is not normally shown in this case. +By default, for matching lines, the file name is followed by a colon; for +context lines, a hyphen separator is used. The -Z option can be used to +change the terminator to a zero byte. If a line number is also being output, +it follows the file name. When the -M option causes a pattern to match +more than one line, only the first is preceded by the file name. This option +overrides any previous -h, -l, or -L options. +
++-h, --no-filename +Suppress the output of file names when searching multiple files. File names are +normally shown when multiple files are searched. By default, for matching +lines, the file name is followed by a colon; for context lines, a hyphen +separator is used. The -Z option can be used to change the terminator to +a zero byte. If a line number is also being output, it follows the file name. +This option overrides any previous -H, -L, or -l options. +
++--heap-limit=number +See --match-limit below. +
++--help +Output a help message, giving brief details of the command options and file +type support, and then exit. Anything else on the command line is +ignored. +
++-I +Ignore binary files. This is equivalent to +--binary-files=without-match. +
++-i, --ignore-case +Ignore upper/lower case distinctions when pattern matching. This applies when +matching path names for inclusion or exclusion as well as when matching lines +in files. +
++--include=pattern +If any --include patterns are specified, the only files that are +processed are those whose names match one of the patterns and do not match an +--exclude pattern. This option does not affect directories, but it +applies to all files, whether listed on the command line, obtained from +--file-list, or by scanning a directory. The pattern is a PCRE2 regular +expression, and is matched against the final component of the file name, not +the entire path. The -F, -w, and -x options do not apply to +this pattern. The option may be given any number of times. If a file name +matches both an --include and an --exclude pattern, it is excluded. +There is no short form for this option. +
++--include-from=filename +Treat each non-empty line of the file as the data for an --include +option. What constitutes a newline for this purpose is the operating system's +default. The --newline option has no effect on this option. This option +may be given any number of times; all the files are read. +
++--include-dir=pattern +If any --include-dir patterns are specified, the only directories that +are processed are those whose names match one of the patterns and do not match +an --exclude-dir pattern. This applies to all directories, whether listed +on the command line, obtained from --file-list, or by scanning a parent +directory. The pattern is a PCRE2 regular expression, and is matched against +the final component of the directory name, not the entire path. The -F, +-w, and -x options do not apply to this pattern. The option may be +given any number of times. If a directory matches both --include-dir and +--exclude-dir, it is excluded. There is no short form for this option. +
++-L, --files-without-match +Instead of outputting lines from the files, just output the names of the files +that do not contain any lines that would have been output. Each file name is +output once, on a separate line by default, but if the -Z option is set, +they are separated by zero bytes instead of newlines. This option overrides any +previous -H, -h, or -l options. +
++-l, --files-with-matches +Instead of outputting lines from the files, just output the names of the files +containing lines that would have been output. Each file name is output once, on +a separate line, but if the -Z option is set, they are separated by zero +bytes instead of newlines. Searching normally stops as soon as a matching line +is found in a file. However, if the -c (count) option is also used, +matching continues in order to obtain the correct count, and those files that +have at least one match are listed along with their counts. Using this option +with -c is a way of suppressing the listing of files with no matches that +occurs with -c on its own. This option overrides any previous -H, +-h, or -L options. +
++--label=name +This option supplies a name to be used for the standard input when file names +are being output. If not supplied, "(standard input)" is used. There is no +short form for this option. +
++--line-buffered +When this option is given, non-compressed input is read and processed line by +line, and the output is flushed after each write. By default, input is read in +large chunks, unless pcre2grep can determine that it is reading from a +terminal, which is currently possible only in Unix-like environments or +Windows. Output to a terminal is normally automatically flushed by the operating +system. This option can be useful when the input or output is attached to a +pipe and you do not want pcre2grep to buffer up large amounts of data. +However, its use will affect performance, and the -M (multiline) option +ceases to work. When input is from a compressed .gz or .bz2 file, +--line-buffered is ignored. +
++--line-offsets +Instead of showing lines or parts of lines that match, show each match as a +line number, the offset from the start of the line, and a length. The line +number is terminated by a colon (as usual; see the -n option), and the +offset and length are separated by a comma. In this mode, --colour has no +effect, and no context is shown. That is, the -A, -B, and -C +options are ignored. If there is more than one match in a line, each of them is +shown separately. This option is mutually exclusive with --output, +--file-offsets, and --only-matching. +
++--locale=locale-name +This option specifies a locale to be used for pattern matching. It overrides +the value in the LC_ALL or LC_CTYPE environment variables. If no +locale is specified, the PCRE2 library's default (usually the "C" locale) is +used. There is no short form for this option. +
+
+-M, --multiline
+Allow patterns to match more than one line. When this option is set, the PCRE2
+library is called in "multiline" mode, and a match is allowed to continue past
+the end of the initial line and onto one or more subsequent lines.
+
+
+Patterns used with -M may usefully contain literal newline characters and
+internal occurrences of ^ and $ characters, because in multiline mode these can
+match at internal newlines. Because pcre2grep is scanning multiple lines,
+the \Z and \z assertions match only at the end of the last line in the file.
+The \A assertion matches at the start of the first line of a match. This can
+be any line in the file; it is not anchored to the first line.
+
+
+The output for a successful match may consist of more than one line. The first
+line is the line in which the match started, and the last line is the line in
+which the match ended. If the matched string ends with a newline sequence, the
+output ends at the end of that line. If -v is set, none of the lines in a
+multi-line match are output. Once a match has been handled, scanning restarts
+at the beginning of the line after the one in which the match ended.
+
+
+The newline sequence that separates multiple lines must be matched as part of
+the pattern. For example, to find the phrase "regular expression" in a file
+where "regular" might be at the end of a line and "expression" at the start of
+the next line, you could use this command:
+
+ pcre2grep -M 'regular\s+expression' <file> ++The \s escape sequence matches any white space character, including newlines, +and is followed by + so as to match trailing white space on the first line as +well as possibly handling a two-character newline sequence. +
+-m number, --max-count=number +Stop processing after finding number matching lines, or non-matching +lines if -v is also set. Any trailing context lines are output after the +final match. In multiline mode, each multiline match counts as just one line +for this purpose. If this limit is reached when reading the standard input from +a regular file, the file is left positioned just after the last matching line. +If -c is also set, the count that is output is never greater than +number. This option has no effect if used with -L, -l, or +-q, or when just checking for a match in a binary file. +
+
+--match-limit=number
+Processing some regular expression patterns may take a very long time to search
+for all possible matching strings. Others may require a very large amount of
+memory. There are three options that set resource limits for matching.
+
+
+The --match-limit option provides a means of limiting computing resource
+usage when processing patterns that are not going to match, but which have a
+very large number of possibilities in their search trees. The classic example
+is a pattern that uses nested unlimited repeats. Internally, PCRE2 has a
+counter that is incremented each time around its main processing loop. If the
+value set by --match-limit is reached, an error occurs.
+
+
+The --heap-limit option specifies, as a number of kibibytes (units of
+1024 bytes), the maximum amount of heap memory that may be used for matching.
+
+
+The --depth-limit option limits the depth of nested backtracking points,
+which indirectly limits the amount of memory that is used. The amount of memory
+needed for each backtracking point depends on the number of capturing
+parentheses in the pattern, so the amount of memory that is used before this
+limit acts varies from pattern to pattern. This limit is of use only if it is
+set smaller than --match-limit.
+
+
+There are no short forms for these options. The default limits can be set
+when the PCRE2 library is compiled; if they are not specified, the defaults
+are very large and so effectively unlimited.
+
+--max-buffer-size=number +This limits the expansion of the processing buffer, whose initial size can be +set by --buffer-size. The maximum buffer size is silently forced to be no +smaller than the starting buffer size. +
++-N newline-type, --newline=newline-type +Six different conventions for indicating the ends of lines in scanned files are +supported. For example: +
+ pcre2grep -N CRLF 'some pattern' <file> ++The newline type may be specified in upper, lower, or mixed case. If the +newline type is NUL, lines are separated by binary zero characters. The other +types are the single-character sequences CR (carriage return) and LF +(linefeed), the two-character sequence CRLF, an "anycrlf" type, which +recognizes any of the preceding three types, and an "any" type, for which any +Unicode line ending sequence is assumed to end a line. The Unicode sequences +are the three just mentioned, plus VT (vertical tab, U+000B), FF (form feed, +U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS +(paragraph separator, U+2029). +
+-n, --line-number +Precede each output line by its line number in the file, followed by a colon +for matching lines or a hyphen for context lines. If the file name is also +being output, it precedes the line number. When the -M option causes a +pattern to match more than one line, only the first is preceded by its line +number. This option is forced if --line-offsets is used. +
++--no-group-separator +Do not output a separator between groups of lines when -A, -B, or +-C is in use. The default is to output a line containing two hyphens. See +also --group-separator. +
++--no-jit +If the PCRE2 library is built with support for just-in-time compiling (which +speeds up matching), pcre2grep automatically makes use of this, unless it +was explicitly disabled at build time. This option can be used to disable the +use of JIT at run time. It is provided for testing and working around problems. +It should never be needed in normal use. +
+
+-O text, --output=text
+When there is a match, instead of outputting the line that matched, output just
+the text specified in this option, followed by an operating-system standard
+newline. In this mode, --colour has no effect, and no context is shown.
+That is, the -A, -B, and -C options are ignored. The
+--newline option has no effect on this option, which is mutually
+exclusive with --only-matching, --file-offsets, and
+--line-offsets. However, like --only-matching, if there is more
+than one match in a line, each of them causes a line of output.
+
+
+Escape sequences starting with a dollar character may be used to insert the
+contents of the matched part of the line and/or captured substrings into the
+text.
+
+
+$<digits> or ${<digits>} is replaced by the captured substring of the given
+decimal number; zero substitutes the whole match. If the number is greater than
+the number of capturing substrings, or if the capture is unset, the replacement
+is empty.
+
+
+$a is replaced by bell; $b by backspace; $e by escape; $f by form feed; $n by
+newline; $r by carriage return; $t by tab; $v by vertical tab.
+
+
+$o<digits> or $o{<digits>} is replaced by the character whose code point is the
+given octal number. In the first form, up to three octal digits are processed.
+When more digits are needed in Unicode mode to specify a wide character, the
+second form must be used.
+
+
+$x<digits> or $x{<digits>} is replaced by the character represented by the
+given hexadecimal number. In the first form, up to two hexadecimal digits are
+processed. When more digits are needed in Unicode mode to specify a wide
+character, the second form must be used.
+
+
+Any other character is substituted by itself. In particular, $$ is replaced by
+a single dollar.
+
+-o, --only-matching +Show only the part of the line that matched a pattern instead of the whole +line. In this mode, no context is shown. That is, the -A, -B, and +-C options are ignored. If there is more than one match in a line, each +of them is shown separately, on a separate line of output. If -o is +combined with -v (invert the sense of the match to find non-matching +lines), no output is generated, but the return code is set appropriately. If +the matched portion of the line is empty, nothing is output unless the file +name or line number are being printed, in which case they are shown on an +otherwise empty line. This option is mutually exclusive with --output, +--file-offsets and --line-offsets. +
+
+-onumber, --only-matching=number
+Show only the part of the line that matched the capturing parentheses of the
+given number. Up to 50 capturing parentheses are supported by default. This
+limit can be changed via the --om-capture option. A pattern may contain
+any number of capturing parentheses, but only those whose number is within the
+limit can be accessed by -o. An error occurs if the number specified by
+-o is greater than the limit.
+
+
+-o0 is the same as -o without a number. Because these options can be
+given without an argument (see above), if an argument is present, it must be
+given in the same shell item, for example, -o3 or --only-matching=2. The
+comments given for the non-argument case above also apply to this option. If
+the specified capturing parentheses do not exist in the pattern, or were not
+set in the match, nothing is output unless the file name or line number are
+being output.
+
+
+If this option is given multiple times, multiple substrings are output for each
+match, in the order the options are given, and all on one line. For example,
+-o3 -o1 -o3 causes the substrings matched by capturing parentheses 3 and 1 and
+then 3 again to be output. By default, there is no separator (but see the next
+but one option).
+
+--om-capture=number +Set the number of capturing parentheses that can be accessed by -o. The +default is 50. +
++--om-separator=text +Specify a separating string for multiple occurrences of -o. The default +is an empty string. Separating strings are never coloured. +
++-P, --no-ucp +Starting from release 10.43, when UTF/Unicode mode is specified with -u +or -U, the PCRE2_UCP option is used by default. This means that the +POSIX classes in patterns match more than just ASCII characters. For example, +[:digit:] matches any Unicode decimal digit. The --no-ucp option +suppresses PCRE2_UCP, thus restricting the POSIX classes to ASCII characters, +as was the case in earlier releases. Note that there are now more fine-grained +option settings within patterns that affect individual classes. For example, +when in UCP mode, the sequence (?aP) restricts [:word:] to ASCII letters, while +allowing \w to match Unicode letters and digits. +
++-q, --quiet +Work quietly, that is, display nothing except error messages. The exit +status indicates whether or not any matches were found. +
++-r, --recursive +If any given path is a directory, recursively scan the files it contains, +taking note of any --include and --exclude settings. By default, a +directory is read as a normal file; in some operating systems this gives an +immediate end-of-file. This option is a shorthand for setting the -d +option to "recurse". +
++--recursion-limit=number +This is an obsolete synonym for --depth-limit. See --match-limit +above for details. +
++-s, --no-messages +Suppress error messages about non-existent or unreadable files. Such files are +quietly skipped. However, the return code is still 2, even if matches were +found in other files. +
++-t, --total-count +This option is useful when scanning more than one file. If used on its own, +-t suppresses all output except for a grand total number of matching +lines (or non-matching lines if -v is used) in all the files. If -t +is used with -c, a grand total is output except when the previous output +is just one line. In other words, it is not output when just one file's count +is listed. If file names are being output, the grand total is preceded by +"TOTAL:". Otherwise, it appears as just another number. The -t option is +ignored when used with -L (list files without matches), because the grand +total would always be zero. +
++-u, --utf +Operate in UTF/Unicode mode. This option is available only if PCRE2 has been +compiled with UTF-8 support. All patterns (including those for any +--exclude and --include options) and all lines that are scanned +must be valid strings of UTF-8 characters. If an invalid UTF-8 string is +encountered, an error occurs. +
++-U, --utf-allow-invalid +As --utf, but in addition subject lines may contain invalid UTF-8 code +unit sequences. These can never form part of any pattern match. Patterns +themselves, however, must still be valid UTF-8 strings. This facility allows +valid UTF-8 strings to be sought within arbitrary byte sequences in executable +or other binary files. For more details about matching in non-valid UTF-8 +strings, see the +pcre2unicode(3) +documentation. +
++-V, --version +Write the version numbers of pcre2grep and the PCRE2 library to the +standard output and then exit. Anything else on the command line is +ignored. +
++-v, --invert-match +Invert the sense of the match, so that lines which do not match any of +the patterns are the ones that are found. When this option is set, options such +as --only-matching and --output, which specify parts of a match +that are to be output, are ignored. +
++-w, --word-regex, --word-regexp +Force the patterns only to match "words". That is, there must be a word +boundary at the start and end of each matched string. This is equivalent to +having "\b(?:" at the start of each pattern, and ")\b" at the end. This +option applies only to the patterns that are matched against the contents of +files; it does not apply to patterns specified by any of the --include or +--exclude options. +
++-x, --line-regex, --line-regexp +Force the patterns to start matching only at the beginnings of lines, and in +addition, require them to match entire lines. In multiline mode the match may +be more than one line. This is equivalent to having "^(?:" at the start of each +pattern and ")$" at the end. This option applies only to the patterns that are +matched against the contents of files; it does not apply to patterns specified +by any of the --include or --exclude options. +
++-Z, --null +Terminate file names in the regular output with a zero byte (the NUL +character) instead of what would normally appear. This is useful when file +names contain unusual characters such as colons, hyphens, or even newlines. The +option does not apply to file names in error messages. +
++The environment variables LC_ALL and LC_CTYPE are examined, in that +order, for a locale. The first one that is set is used. This can be overridden +by the --locale option. If no locale is set, the PCRE2 library's default +(usually the "C" locale) is used. +
++The -N (--newline) option allows pcre2grep to scan files with +newline conventions that differ from the default. This option affects only the +way scanned files are processed. It does not affect the interpretation of files +specified by the -f, --file-list, --exclude-from, or +--include-from options. +
++Any parts of the scanned input files that are written to the standard output +are copied with whatever newline sequences they have in the input. However, if +the final line of a file is output, and it does not end with a newline +sequence, a newline sequence is added. If the newline setting is CR, LF, CRLF +or NUL, that line ending is output; for the other settings (ANYCRLF or ANY) a +single NL is used. +
++The newline setting does not affect the way in which pcre2grep writes +newlines in informational messages to the standard output and error streams. +Under Windows, the standard output is set to be binary, so that "\r\n" at the +ends of output lines that are copied from the input is not converted to +"\r\r\n" by the C I/O library. This means that any messages written to the +standard output must end with "\r\n". For all other operating systems, and +for all messages to the standard error stream, "\n" is used. +
++Many of the short and long forms of pcre2grep's options are the same as +in the GNU grep program. Any long option of the form --xxx-regexp +(GNU terminology) is also available as --xxx-regex (PCRE2 terminology). +However, the --case-restrict, --depth-limit, -E, +--file-list, --file-offsets, --heap-limit, +--include-dir, --line-offsets, --locale, --match-limit, +-M, --multiline, -N, --newline, --no-ucp, +--om-separator, --output, -P, -u, --utf, +-U, and --utf-allow-invalid options are specific to +pcre2grep, as is the use of the --only-matching option with a +capturing parentheses number. +
++Although most of the common options work the same way, a few are different in +pcre2grep. For example, the --include option's argument is a glob +for GNU grep, but in pcre2grep it is a regular expression to which +the -i option applies. If both the -c and -l options are +given, GNU grep lists only file names, without counts, but pcre2grep +gives the counts as well. +
++There are four different ways in which an option with data can be specified. +If a short form option is used, the data may follow immediately, or (with one +exception) in the next command line item. For example: +
+ -f/some/file + -f /some/file ++The exception is the -o option, which may appear with or without data. +Because of this, if data is present, it must follow immediately in the same +item, for example -o3. + +
+If a long form option is used, the data may appear in the same command line +item, separated by an equals character, or (with two exceptions) it may appear +in the next command line item. For example: +
+ --file=/some/file + --file /some/file ++Note, however, that if you want to supply a file name beginning with ~ as data +in a shell command, and have the shell expand ~ to a home directory, you must +separate the file name from the option, because the shell does not treat ~ +specially unless it is at the start of an item. + +
+The exceptions to the above are the --colour (or --color) and +--only-matching options, for which the data is optional. If one of these +options does have data, it must be given in the first form, using an equals +character. Otherwise pcre2grep will assume that it has no data. +
++pcre2grep has, by default, support for calling external programs or +scripts or echoing specific strings during matching by making use of PCRE2's +callout facility. However, this support can be completely or partially disabled +when pcre2grep is built. You can find out whether your binary has support +for callouts by running it with the --help option. If callout support is +completely disabled, all callouts in patterns are ignored by pcre2grep. +If the facility is partially disabled, calling external programs is not +supported, and callouts that request it are ignored. +
++A callout in a PCRE2 pattern is of the form (?C<arg>) where the argument is +either a number or a quoted string (see the +pcre2callout +documentation for details). Numbered callouts are ignored by pcre2grep; +only callouts with string arguments are useful. +
++Starting the callout string with a pipe character invokes an echoing facility +that avoids calling an external program or script. This facility is always +available, provided that callouts were not completely disabled when +pcre2grep was built. The rest of the callout string is processed as a +zero-terminated string, which means it should not contain any internal binary +zeros. It is written to the output, having first been passed through the same +escape processing as text from the --output (-O) option (see +above). However, $0 cannot be used to insert a matched substring because the +match is still in progress. Instead, the single character '0' is inserted. Any +syntax errors in the string (for example, a dollar not followed by another +character) cause the callout to be ignored. No terminator is added to the +output string, so if you want a newline, you must include it explicitly using +the escape $n. For example: +
+ pcre2grep '(.)(..(.))(?C"|[$1] [$2] [$3]$n")' <some file> ++Matching continues normally after the string is output. If you want to see only +the callout output but not any output from an actual match, you should end the +pattern with (*FAIL). + +
+This facility can be independently disabled when pcre2grep is built. It +is supported for Windows, where a call to _spawnvp() is used, for VMS, +where lib$spawn() is used, and for any Unix-like environment where +fork() and execv() are available. +
++If the callout string does not start with a pipe (vertical bar) character, it +is parsed into a list of substrings separated by pipe characters. The first +substring must be an executable name, with the following substrings specifying +arguments: +
+ executable_name|arg1|arg2|... ++Any substring (including the executable name) may contain escape sequences +started by a dollar character. These are the same as for the --output +(-O) option documented above, except that $0 cannot insert the matched +string because the match is still in progress. Instead, the character '0' +is inserted. If you need a literal dollar or pipe character in any +substring, use $$ or $| respectively. Here is an example: +
+ echo -e "abcde\n12345" | pcre2grep \
+ '(?x)(.)(..(.))
+ (?C"/bin/echo|Arg1: [$1] [$2] [$3]|Arg2: $|${1}$| ($4)")()' -
+
+ Output:
+
+ Arg1: [a] [bcd] [d] Arg2: |a| ()
+ abcde
+ Arg1: [1] [234] [4] Arg2: |1| ()
+ 12345
+
+The parameters for the system call that is used to run the program or script
+are zero-terminated strings. This means that binary zero characters in the
+callout argument will cause premature termination of their substrings, and
+therefore should not be present. Any syntax errors in the string (for example,
+a dollar not followed by another character) cause the callout to be ignored.
+If running the program fails for any reason (including the non-existence of the
+executable), a local matching failure occurs and the matcher backtracks in the
+normal way.
+
++It is possible to supply a regular expression that takes a very long time to +fail to match certain lines. Such patterns normally involve nested indefinite +repeats, for example: (a+)*\d when matched against a line of a's with no final +digit. The PCRE2 matching function has a resource limit that causes it to abort +in these circumstances. If this happens, pcre2grep outputs an error +message and the line that caused the problem to the standard error stream. If +there are more than 20 such errors, pcre2grep gives up. +
++The --match-limit option of pcre2grep can be used to set the +overall resource limit. There are also other limits that affect the amount of +memory used during matching; see the discussion of --heap-limit and +--depth-limit above. +
++Exit status is 0 if any matches were found, 1 if no matches were found, and 2 +for syntax errors, overlong lines, non-existent or inaccessible files (even if +matches were found in other files) or too many matching errors. Using the +-s option to suppress error messages about inaccessible files does not +affect the return code. +
++When run under VMS, the return code is placed in the symbol PCRE2GREP_RC +because VMS does not distinguish between exit(0) and exit(1). +
++pcre2pattern(3), pcre2syntax(3), pcre2callout(3), +pcre2unicode(3). +
+
+Philip Hazel
+
+Retired from University Computing Service
+
+Cambridge, England.
+
+
+Last updated: 22 December 2023
+
+Copyright © 1997-2023 University of Cambridge.
+
+
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2jit.html b/doc/html/pcre2jit.html new file mode 100644 index 0000000..d97d800 --- /dev/null +++ b/doc/html/pcre2jit.html @@ -0,0 +1,496 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+Just-in-time compiling is a heavyweight optimization that can greatly speed up +pattern matching. However, it comes at the cost of extra processing before the +match is performed, so it is of most benefit when the same pattern is going to +be matched many times. This does not necessarily mean many calls of a matching +function; if the pattern is not anchored, matching attempts may take place many +times at various positions in the subject, even for a single call. Therefore, +if the subject string is very long, it may still pay to use JIT even for +one-off matches. JIT support is available for all of the 8-bit, 16-bit and +32-bit PCRE2 libraries. +
++JIT support applies only to the traditional Perl-compatible matching function. +It does not apply when the DFA matching function is being used. The code for +JIT support was written by Zoltan Herczeg. +
++JIT support is an optional feature of PCRE2. The "configure" option +--enable-jit (or equivalent CMake option) must be set when PCRE2 is built if +you want to use JIT. The support is limited to the following hardware +platforms: +
+ ARM 32-bit (v7, and Thumb2) + ARM 64-bit + IBM s390x 64 bit + Intel x86 32-bit and 64-bit + LoongArch 64 bit + MIPS 32-bit and 64-bit + Power PC 32-bit and 64-bit + RISC-V 32-bit and 64-bit ++If --enable-jit is set on an unsupported platform, compilation fails. + +
+A client program can tell if JIT support is available by calling +pcre2_config() with the PCRE2_CONFIG_JIT option. The result is one if +PCRE2 was built with JIT support, and zero otherwise. However, having the JIT +code available does not guarantee that it will be used for any particular +match. One reason for this is that there are a number of options and pattern +items that are +not supported by JIT +(see below). Another reason is that in some environments JIT is unable to get +memory in which to build its compiled code. The only guarantee from +pcre2_config() is that if it returns zero, JIT will definitely not +be used. +
++A simple program does not need to check availability in order to use JIT when +possible. The API is implemented in a way that falls back to the interpretive +code if JIT is not available or cannot be used for a given match. For programs +that need the best possible performance, there is a +"fast path" +API that is JIT-specific. +
++To make use of the JIT support in the simplest way, all you have to do is to +call pcre2_jit_compile() after successfully compiling a pattern with +pcre2_compile(). This function has two arguments: the first is the +compiled pattern pointer that was returned by pcre2_compile(), and the +second is zero or more of the following option bits: PCRE2_JIT_COMPLETE, +PCRE2_JIT_PARTIAL_HARD, or PCRE2_JIT_PARTIAL_SOFT. +
++If JIT support is not available, a call to pcre2_jit_compile() does +nothing and returns PCRE2_ERROR_JIT_BADOPTION. Otherwise, the compiled pattern +is passed to the JIT compiler, which turns it into machine code that executes +much faster than the normal interpretive code, but yields exactly the same +results. The returned value from pcre2_jit_compile() is zero on success, +or a negative error code. +
++There is a limit to the size of pattern that JIT supports, imposed by the size +of machine stack that it uses. The exact rules are not documented because they +may change at any time, in particular, when new optimizations are introduced. +If a pattern is too big, a call to pcre2_jit_compile() returns +PCRE2_ERROR_NOMEMORY. +
++PCRE2_JIT_COMPLETE requests the JIT compiler to generate code for complete +matches. If you want to run partial matches using the PCRE2_PARTIAL_HARD or +PCRE2_PARTIAL_SOFT options of pcre2_match(), you should set one or both +of the other options as well as, or instead of PCRE2_JIT_COMPLETE. The JIT +compiler generates different optimized code for each of the three modes +(normal, soft partial, hard partial). When pcre2_match() is called, the +appropriate code is run if it is available. Otherwise, the pattern is matched +using interpretive code. +
++You can call pcre2_jit_compile() multiple times for the same compiled +pattern. It does nothing if it has previously compiled code for any of the +option bits. For example, you can call it once with PCRE2_JIT_COMPLETE and +(perhaps later, when you find you need partial matching) again with +PCRE2_JIT_COMPLETE and PCRE2_JIT_PARTIAL_HARD. This time it will ignore +PCRE2_JIT_COMPLETE and just compile code for partial matching. If +pcre2_jit_compile() is called with no option bits set, it immediately +returns zero. This is an alternative way of testing whether JIT is available. +
++At present, it is not possible to free JIT compiled code except when the entire +compiled pattern is freed by calling pcre2_code_free(). +
++In some circumstances you may need to call additional functions. These are +described in the section entitled +"Controlling the JIT stack" +below. +
++There are some pcre2_match() options that are not supported by JIT, and +there are also some pattern items that JIT cannot handle. Details are given +below. +In both cases, matching automatically falls back to the interpretive code. If +you want to know whether JIT was actually used for a particular match, you +should arrange for a JIT callback function to be set up as described in the +section entitled +"Controlling the JIT stack" +below, even if you do not need to supply a non-default JIT stack. Such a +callback function is called whenever JIT code is about to be obeyed. If the +match-time options are not right for JIT execution, the callback function is +not obeyed. +
++If the JIT compiler finds an unsupported item, no JIT data is generated. You +can find out if JIT compilation was successful for a compiled pattern by +calling pcre2_pattern_info() with the PCRE2_INFO_JITSIZE option. A +non-zero result means that JIT compilation was successful. A result of 0 means +that JIT support is not available, or the pattern was not processed by +pcre2_jit_compile(), or the JIT compiler was not able to handle the +pattern. Successful JIT compilation does not, however, guarantee the use of JIT +at match time because there are some match time options that are not supported +by JIT. +
++When a pattern is compiled with the PCRE2_UTF option, subject strings are +normally expected to be a valid sequence of UTF code units. By default, this is +checked at the start of matching and an error is generated if invalid UTF is +detected. The PCRE2_NO_UTF_CHECK option can be passed to pcre2_match() to +skip the check (for improved performance) if you are sure that a subject string +is valid. If this option is used with an invalid string, the result is +undefined. The calling program may crash or loop or otherwise misbehave. +
++However, a way of running matches on strings that may contain invalid UTF +sequences is available. Calling pcre2_compile() with the +PCRE2_MATCH_INVALID_UTF option has two effects: it tells the interpreter in +pcre2_match() to support invalid UTF, and, if pcre2_jit_compile() +is subsequently called, the compiled JIT code also supports invalid UTF. +Details of how this support works, in both the JIT and the interpretive cases, +are given in the +pcre2unicode +documentation. +
++There is also an obsolete option for pcre2_jit_compile() called +PCRE2_JIT_INVALID_UTF, which currently exists only for backward compatibility. +It is superseded by the pcre2_compile() option PCRE2_MATCH_INVALID_UTF +and should no longer be used. It may be removed in future. +
++The pcre2_match() options that are supported for JIT matching are +PCRE2_COPY_MATCHED_SUBJECT, PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, +PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and +PCRE2_PARTIAL_SOFT. The PCRE2_ANCHORED and PCRE2_ENDANCHORED options are not +supported at match time. +
++If the PCRE2_NO_JIT option is passed to pcre2_match() it disables the +use of JIT, forcing matching by the interpreter code. +
++The only unsupported pattern items are \C (match a single data unit) when +running in a UTF mode, and a callout immediately before an assertion condition +in a conditional group. +
++When a pattern is matched using JIT, the return values are the same as those +given by the interpretive pcre2_match() code, with the addition of one +new error code: PCRE2_ERROR_JIT_STACKLIMIT. This means that the memory used for +the JIT stack was insufficient. See +"Controlling the JIT stack" +below for a discussion of JIT stack usage. +
++The error code PCRE2_ERROR_MATCHLIMIT is returned by the JIT code if searching +a very large pattern tree goes on for too long, as it is in the same +circumstance when JIT is not used, but the details of exactly what is counted +are not the same. The PCRE2_ERROR_DEPTHLIMIT error code is never returned +when JIT matching is used. +
++When the compiled JIT code runs, it needs a block of memory to use as a stack. +By default, it uses 32KiB on the machine stack. However, some large or +complicated patterns need more than this. The error PCRE2_ERROR_JIT_STACKLIMIT +is given when there is not enough stack. Three functions are provided for +managing blocks of memory for use as JIT stacks. There is further discussion +about the use of JIT stacks in the section entitled +"JIT stack FAQ" +below. +
++The pcre2_jit_stack_create() function creates a JIT stack. Its arguments +are a starting size, a maximum size, and a general context (for memory +allocation functions, or NULL for standard memory allocation). It returns a +pointer to an opaque structure of type pcre2_jit_stack, or NULL if there +is an error. The pcre2_jit_stack_free() function is used to free a stack +that is no longer needed. If its argument is NULL, this function returns +immediately, without doing anything. (For the technically minded: the address +space is allocated by mmap or VirtualAlloc.) A maximum stack size of 512KiB to +1MiB should be more than enough for any pattern. +
++The pcre2_jit_stack_assign() function specifies which stack JIT code +should use. Its arguments are as follows: +
+ pcre2_match_context *mcontext + pcre2_jit_callback callback + void *data ++The first argument is a pointer to a match context. When this is subsequently +passed to a matching function, its information determines which JIT stack is +used. If this argument is NULL, the function returns immediately, without doing +anything. There are three cases for the values of the other two options: +
+ (1) If callback is NULL and data is NULL, an internal 32KiB block + on the machine stack is used. This is the default when a match + context is created. + + (2) If callback is NULL and data is not NULL, data must be + a pointer to a valid JIT stack, the result of calling + pcre2_jit_stack_create(). + + (3) If callback is not NULL, it must point to a function that is + called with data as an argument at the start of matching, in + order to set up a JIT stack. If the return from the callback + function is NULL, the internal 32KiB stack is used; otherwise the + return value must be a valid JIT stack, the result of calling + pcre2_jit_stack_create(). ++A callback function is obeyed whenever JIT code is about to be run; it is not +obeyed when pcre2_match() is called with options that are incompatible +for JIT matching. A callback function can therefore be used to determine +whether a match operation was executed by JIT or by the interpreter. + +
+You may safely use the same JIT stack for more than one pattern (either by +assigning directly or by callback), as long as the patterns are matched +sequentially in the same thread. Currently, the only way to set up +non-sequential matches in one thread is to use callouts: if a callout function +starts another match, that match must use a different JIT stack to the one used +for currently suspended match(es). +
++In a multithread application, if you do not specify a JIT stack, or if you +assign or pass back NULL from a callback, that is thread-safe, because each +thread has its own machine stack. However, if you assign or pass back a +non-NULL JIT stack, this must be a different stack for each thread so that the +application is thread-safe. +
++Strictly speaking, even more is allowed. You can assign the same non-NULL stack +to a match context that is used by any number of patterns, as long as they are +not used for matching by multiple threads at the same time. For example, you +could use the same stack in all compiled patterns, with a global mutex in the +callback to wait until the stack is available for use. However, this is an +inefficient solution, and not recommended. +
++This is a suggestion for how a multithreaded program that needs to set up +non-default JIT stacks might operate: +
+ During thread initialization + thread_local_var = pcre2_jit_stack_create(...) + + During thread exit + pcre2_jit_stack_free(thread_local_var) + + Use a one-line callback function + return thread_local_var ++All the functions described in this section do nothing if JIT is not available. + +
+(1) Why do we need JIT stacks?
+
+
+PCRE2 (and JIT) is a recursive, depth-first engine, so it needs a stack where
+the local data of the current node is pushed before checking its child nodes.
+Allocating real machine stack on some platforms is difficult. For example, the
+stack chain needs to be updated every time we extend the stack on PowerPC.
+Although this is possible, the time overhead of updating it decreases
+performance. So we do the recursion in memory.
+
+(2) Why don't we simply allocate blocks of memory with malloc()?
+
+
+Modern operating systems have a nice feature: they can reserve an address space
+instead of allocating memory. We can safely allocate memory pages inside this
+address space, so the stack could grow without moving memory data (this is
+important because of pointers). Thus we can allocate 1MiB address space, and
+use only a single memory page (usually 4KiB) if that is enough. However, we can
+still grow up to 1MiB anytime if needed.
+
+(3) Who "owns" a JIT stack?
+
+
+The owner of the stack is the user program, not the JIT studied pattern or
+anything else. The user program must ensure that if a stack is being used by
+pcre2_match() (that is, it is assigned to a match context that is passed
+to the pattern currently running), that stack is not used by any other
+threads (to avoid overwriting the same memory area). The best practice for
+multithreaded programs is to allocate a stack for each thread, and return this
+stack through the JIT callback function.
+
+(4) When should a JIT stack be freed?
+
+
+You can free a JIT stack at any time, as long as it will not be used by
+pcre2_match() again. When you assign the stack to a match context, only a
+pointer is set. There is no reference counting or any other magic. You can free
+compiled patterns, contexts, and stacks in any order, anytime.
+Just do not call pcre2_match() with a match context pointing to an
+already freed stack, as that will cause a segmentation fault. (Also, do not free a stack
+currently used by pcre2_match() in another thread). You can also replace
+the stack in a context at any time when it is not in use. You should free the
+previous stack before assigning a replacement.
+
+(5) Should I allocate/free a stack every time before/after calling
+pcre2_match()?
+
+
+No, because this is too costly in terms of resources. However, you could
+implement some clever idea which releases the stack if it is not used in let's
+say two minutes. The JIT callback can help to achieve this without keeping a
+list of patterns.
+
+(6) OK, the stack is for long term memory allocation. But what happens if a
+pattern causes stack overflow with a stack of 1MiB? Is that 1MiB kept until the
+stack is freed?
+
+
+Especially on embedded systems, it might be a good idea to release memory
+sometimes without freeing the stack. There is no API for this at the moment.
+Probably a function call which returns with the currently allocated memory for
+any stack and another which allows releasing memory (shrinking the stack) would
+be a good idea if someone needs this.
+
+(7) This is too much of a headache. Isn't there any better solution for JIT
+stack handling?
+
+
+No, thanks to Windows. If POSIX threads were used everywhere, we could throw
+out this complicated API.
+
+void pcre2_jit_free_unused_memory(pcre2_general_context *gcontext); +
++The JIT executable allocator does not free all memory when it is possible. It +expects new allocations, and keeps some free memory around to improve +allocation speed. However, in low memory conditions, it might be better to free +all possible memory. You can cause this to happen by calling +pcre2_jit_free_unused_memory(). Its argument is a general context, for custom +memory management, or NULL for standard memory management. +
++This is a single-threaded example that specifies a JIT stack without using a +callback. A real program should include error checking after all the function +calls. +
+ int rc; + pcre2_code *re; + pcre2_match_data *match_data; + pcre2_match_context *mcontext; + pcre2_jit_stack *jit_stack; + + re = pcre2_compile(pattern, PCRE2_ZERO_TERMINATED, 0, + &errornumber, &erroffset, NULL); + rc = pcre2_jit_compile(re, PCRE2_JIT_COMPLETE); + mcontext = pcre2_match_context_create(NULL); + jit_stack = pcre2_jit_stack_create(32*1024, 512*1024, NULL); + pcre2_jit_stack_assign(mcontext, NULL, jit_stack); + match_data = pcre2_match_data_create(re, 10); + rc = pcre2_match(re, subject, length, 0, 0, match_data, mcontext); + /* Process result */ + + pcre2_code_free(re); + pcre2_match_data_free(match_data); + pcre2_match_context_free(mcontext); + pcre2_jit_stack_free(jit_stack); + ++ +
+Because the API described above falls back to interpreted matching when JIT is +not available, it is convenient for programs that are written for general use +in many environments. However, calling JIT via pcre2_match() does have a +performance impact. Programs that are written for use where JIT is known to be +available, and which need the best possible performance, can instead use a +"fast path" API to call JIT matching directly instead of calling +pcre2_match() (obviously only for patterns that have been successfully +processed by pcre2_jit_compile()). +
++The fast path function is called pcre2_jit_match(), and it takes exactly +the same arguments as pcre2_match(). However, the subject string must be +specified with a length; PCRE2_ZERO_TERMINATED is not supported. Unsupported +option bits (for example, PCRE2_ANCHORED and PCRE2_ENDANCHORED) are ignored, as +is the PCRE2_NO_JIT option. The return values are also the same as for +pcre2_match(), plus PCRE2_ERROR_JIT_BADOPTION if a matching mode (partial +or complete) is requested that was not compiled. +
++When you call pcre2_match(), as well as testing for invalid options, a +number of other sanity checks are performed on the arguments. For example, if +the subject pointer is NULL but the length is non-zero, an immediate error is +given. Also, unless PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested +for validity. In the interests of speed, these checks do not happen on the JIT +fast path. If invalid UTF data is passed when PCRE2_MATCH_INVALID_UTF was not +set for pcre2_compile(), the result is undefined. The program may crash +or loop or give wrong results. In the absence of PCRE2_MATCH_INVALID_UTF you +should call pcre2_jit_match() in UTF mode only if you are sure the +subject is valid. +
++Bypassing the sanity checks and the pcre2_match() wrapping can give +speedups of more than 10%. +
++pcre2api(3), pcre2unicode(3) +
+
+Philip Hazel (FAQ by Zoltan Herczeg)
+
+Retired from University Computing Service
+
+Cambridge, England.
+
+
+Last updated: 21 February 2024
+
+Copyright © 1997-2024 University of Cambridge.
+
+
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2limits.html b/doc/html/pcre2limits.html new file mode 100644 index 0000000..8152ed2 --- /dev/null +++ b/doc/html/pcre2limits.html @@ -0,0 +1,105 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SIZE AND OTHER LIMITATIONS
+
+
+There are some size limitations in PCRE2 but it is hoped that they will never +in practice be relevant. +
++The maximum size of a compiled pattern is approximately 64 thousand code units +for the 8-bit and 16-bit libraries if PCRE2 is compiled with the default +internal linkage size, which is 2 bytes for these libraries. If you want to +process regular expressions that are truly enormous, you can compile PCRE2 with +an internal linkage size of 3 or 4 (when building the 16-bit library, 3 is +rounded up to 4). See the README file in the source distribution and the +pcre2build +documentation for details. In these cases the limit is substantially larger. +However, the speed of execution is slower. In the 32-bit library, the internal +linkage size is always 4. +
++The maximum length of a source pattern string is essentially unlimited; it is +the largest number a PCRE2_SIZE variable can hold. However, the program that +calls pcre2_compile() can specify a smaller limit. +
++The maximum length (in code units) of a subject string is one less than the +largest number a PCRE2_SIZE variable can hold. PCRE2_SIZE is an unsigned +integer type, usually defined as size_t. Its maximum value (that is +~(PCRE2_SIZE)0) is reserved as a special indicator for zero-terminated strings +and unset offsets. +
++All values in repeating quantifiers must be less than 65536. +
++There are two different limits that apply to branches of lookbehind assertions. +If every branch in such an assertion matches a fixed number of characters, +the maximum length of any branch is 65535 characters. If any branch matches a +variable number of characters, then the maximum matching length for every +branch is limited. The default limit is set at compile time, defaulting to 255, +but can be changed by the calling program. +
++There is no limit to the number of parenthesized groups, but there can be no +more than 65535 capture groups, and there is a limit to the depth of nesting of +parenthesized subpatterns of all kinds. This is imposed in order to limit the +amount of system stack used at compile time. The default limit can be specified +when PCRE2 is built; if not, the default is set to 250. An application can +change this limit by calling pcre2_set_parens_nest_limit() to set the limit in +a compile context. +
++The maximum length of a name for a named capture group is 32 code units, and the +maximum number of such groups is 10000. +
++The maximum length of a name in a (*MARK), (*PRUNE), (*SKIP), or (*THEN) verb +is 255 code units for the 8-bit library and 65535 code units for the 16-bit and +32-bit libraries. +
++The maximum length of a string argument to a callout is the largest number a +32-bit unsigned integer can hold. +
++The maximum amount of heap memory used for matching is controlled by the heap +limit, which can be set in a pattern or in a match context. The default is a +very large number, effectively unlimited. +
+
+Philip Hazel
+
+Retired from University Computing Service
+
+Cambridge, England.
+
+
+Last updated: August 2023
+
+Copyright © 1997-2023 University of Cambridge.
+
+
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2matching.html b/doc/html/pcre2matching.html new file mode 100644 index 0000000..3b8b629 --- /dev/null +++ b/doc/html/pcre2matching.html @@ -0,0 +1,253 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+This document describes the two different algorithms that are available in +PCRE2 for matching a compiled regular expression against a given subject +string. The "standard" algorithm is the one provided by the pcre2_match() +function. This works in the same way as Perl's matching function, and provides a +Perl-compatible matching operation. The just-in-time (JIT) optimization that is +described in the +pcre2jit +documentation is compatible with this function. +
++An alternative algorithm is provided by the pcre2_dfa_match() function; +it operates in a different way, and is not Perl-compatible. This alternative +has advantages and disadvantages compared with the standard algorithm, and +these are described below. +
++When there is only one possible way in which a given subject string can match a +pattern, the two algorithms give the same answer. A difference arises, however, +when there are multiple possibilities. For example, if the pattern +
+ ^<.*> ++is matched against the string +
+ <something> <something else> <something further> ++there are three possible answers. The standard algorithm finds only one of +them, whereas the alternative algorithm finds all three. + +
+The set of strings that are matched by a regular expression can be represented +as a tree structure. An unlimited repetition in the pattern makes the tree of +infinite size, but it is still a tree. Matching the pattern to a given subject +string (from a given starting point) can be thought of as a search of the tree. +There are two ways to search a tree: depth-first and breadth-first, and these +correspond to the two matching algorithms provided by PCRE2. +
++In the terminology of Jeffrey Friedl's book "Mastering Regular Expressions", +the standard algorithm is an "NFA algorithm". It conducts a depth-first search +of the pattern tree. That is, it proceeds along a single path through the tree, +checking that the subject matches what is required. When there is a mismatch, +the algorithm tries any alternatives at the current point, and if they all +fail, it backs up to the previous branch point in the tree, and tries the next +alternative branch at that level. This often involves backing up (moving to the +left) in the subject string as well. The order in which repetition branches are +tried is controlled by the greedy or ungreedy nature of the quantifier. +
++If a leaf node is reached, a matching string has been found, and at that point +the algorithm stops. Thus, if there is more than one possible match, this +algorithm returns the first one that it finds. Whether this is the shortest, +the longest, or some intermediate length depends on the way the alternations +and the greedy or ungreedy repetition quantifiers are specified in the +pattern. +
++Because it ends up with a single path through the tree, it is relatively +straightforward for this algorithm to keep track of the substrings that are +matched by portions of the pattern in parentheses. This provides support for +capturing parentheses and backreferences. +
++This algorithm conducts a breadth-first search of the tree. Starting from the +first matching point in the subject, it scans the subject string from left to +right, once, character by character, and as it does this, it remembers all the +paths through the tree that represent valid matches. In Friedl's terminology, +this is a kind of "DFA algorithm", though it is not implemented as a +traditional finite state machine (it keeps multiple states active +simultaneously). +
++Although the general principle of this matching algorithm is that it scans the +subject string only once, without backtracking, there is one exception: when a +lookaround assertion is encountered, the characters following or preceding the +current point have to be independently inspected. +
++The scan continues until either the end of the subject is reached, or there are +no more unterminated paths. At this point, terminated paths represent the +different matching possibilities (if there are none, the match has failed). +Thus, if there is more than one possible match, this algorithm finds all of +them, and in particular, it finds the longest. The matches are returned in +the output vector in decreasing order of length. There is an option to stop the +algorithm after the first match (which is necessarily the shortest) is found. +
++Note that the size of vector needed to contain all the results depends on the +number of simultaneous matches, not on the number of parentheses in the +pattern. Using pcre2_match_data_create_from_pattern() to create the match +data block is therefore not advisable when doing DFA matching. +
++Note also that all the matches that are found start at the same point in the +subject. If the pattern +
+ cat(er(pillar)?)? ++is matched against the string "the caterpillar catchment", the result is the +three strings "caterpillar", "cater", and "cat" that start at the fifth +character of the subject. The algorithm does not automatically move on to find +matches that start at later positions. + +
+PCRE2's "auto-possessification" optimization usually applies to character +repeats at the end of a pattern (as well as internally). For example, the +pattern "a\d+" is compiled as if it were "a\d++" because there is no point +even considering the possibility of backtracking into the repeated digits. For +DFA matching, this means that only one possible match is found. If you really +do want multiple matches in such cases, either use an ungreedy repeat +("a\d+?") or set the PCRE2_NO_AUTO_POSSESS option when compiling. +
++There are a number of features of PCRE2 regular expressions that are not +supported or behave differently in the alternative matching function. Those +that are not supported cause an error if encountered. +
++1. Because the algorithm finds all possible matches, the greedy or ungreedy +nature of repetition quantifiers is not relevant (though it may affect +auto-possessification, as just described). During matching, greedy and ungreedy +quantifiers are treated in exactly the same way. However, possessive +quantifiers can make a difference when what follows could also match what is +quantified, for example in a pattern like this: +
+ ^a++\w! ++This pattern matches "aaab!" but not "aaa!", which would be matched by a +non-possessive quantifier. Similarly, if an atomic group is present, it is +matched as if it were a standalone pattern at the current point, and the +longest match is then "locked in" for the rest of the overall pattern. + +
+2. When dealing with multiple paths through the tree simultaneously, it is not +straightforward to keep track of captured substrings for the different matching +possibilities, and PCRE2's implementation of this algorithm does not attempt to +do this. This means that no captured substrings are available. +
++3. Because no substrings are captured, backreferences within the pattern are +not supported. +
++4. For the same reason, conditional expressions that use a backreference as the +condition or test for a specific group recursion are not supported. +
++5. Again for the same reason, script runs are not supported. +
++6. Because many paths through the tree may be active, the \K escape sequence, +which resets the start of the match when encountered (but may be on some paths +and not on others), is not supported. +
++7. Callouts are supported, but the value of the capture_top field is +always 1, and the value of the capture_last field is always 0. +
++8. The \C escape sequence, which (in the standard algorithm) always matches a +single code unit, even in a UTF mode, is not supported in these modes, because +the alternative algorithm moves through the subject string one character (not +code unit) at a time, for all active paths through the tree. +
++9. Except for (*FAIL), the backtracking control verbs such as (*PRUNE) are not +supported. (*FAIL) is supported, and behaves like a failing negative assertion. +
++10. The PCRE2_MATCH_INVALID_UTF option for pcre2_compile() is not +supported by pcre2_dfa_match(). +
++The main advantage of the alternative algorithm is that all possible matches +(at a single point in the subject) are automatically found, and in particular, +the longest match is found. To find more than one match at the same point using +the standard algorithm, you have to do kludgy things with callouts. +
++Partial matching is possible with this algorithm, though it has some +limitations. The +pcre2partial +documentation gives details of partial matching and discusses multi-segment +matching. +
++The alternative algorithm suffers from a number of disadvantages: +
++1. It is substantially slower than the standard algorithm. This is partly +because it has to search for all possible matches, but is also because it is +less susceptible to optimization. +
++2. Capturing parentheses, backreferences, script runs, and matching within +invalid UTF strings are not supported. +
++3. Although atomic groups are supported, their use does not provide the +performance advantage that it does for the standard algorithm. +
++4. JIT optimization is not supported. +
+
+Philip Hazel
+
+Retired from University Computing Service
+
+Cambridge, England.
+
+
+Last updated: 19 January 2024
+
+Copyright © 1997-2024 University of Cambridge.
+
+
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2partial.html b/doc/html/pcre2partial.html new file mode 100644 index 0000000..64116c4 --- /dev/null +++ b/doc/html/pcre2partial.html @@ -0,0 +1,408 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+In normal use of PCRE2, if there is a match up to the end of a subject string, +but more characters are needed to match the entire pattern, PCRE2_ERROR_NOMATCH +is returned, just like any other failing match. There are circumstances where +it might be helpful to distinguish this "partial match" case. +
++One example is an application where the subject string is very long, and not +all available at once. The requirement here is to be able to do the matching +segment by segment, but special action is needed when a matched substring spans +the boundary between two segments. +
++Another example is checking a user input string as it is typed, to ensure that +it conforms to a required format. Invalid characters can be immediately +diagnosed and rejected, giving instant feedback. +
++Partial matching is a PCRE2-specific feature; it is not Perl-compatible. It is +requested by setting one of the PCRE2_PARTIAL_HARD or PCRE2_PARTIAL_SOFT +options when calling a matching function. The difference between the two +options is whether or not a partial match is preferred to an alternative +complete match, though the details differ between the two types of matching +function. If both options are set, PCRE2_PARTIAL_HARD takes precedence. +
++If you want to use partial matching with just-in-time optimized code, as well +as setting a partial match option for the matching function, you must also call +pcre2_jit_compile() with one or both of these options: +
+ PCRE2_JIT_PARTIAL_HARD + PCRE2_JIT_PARTIAL_SOFT ++PCRE2_JIT_COMPLETE should also be set if you are going to run non-partial +matches on the same pattern. Separate code is compiled for each mode. If the +appropriate JIT mode has not been compiled, interpretive matching code is used. + +
+Setting a partial matching option disables two of PCRE2's standard +optimization hints. PCRE2 remembers the last literal code unit in a pattern, +and abandons matching immediately if it is not present in the subject string. +This optimization cannot be used for a subject string that might match only +partially. PCRE2 also remembers a minimum length of a matching string, and does +not bother to run the matching function on shorter strings. This optimization +is also disabled for partial matching. +
++A possible partial match occurs during matching when the end of the subject +string is reached successfully, but either more characters are needed to +complete the match, or the addition of more characters might change what is +matched. +
++Example 1: if the pattern is /abc/ and the subject is "ab", more characters are +definitely needed to complete a match. In this case both hard and soft matching +options yield a partial match. +
++Example 2: if the pattern is /ab+/ and the subject is "ab", a complete match +can be found, but the addition of more characters might change what is +matched. In this case, only PCRE2_PARTIAL_HARD returns a partial match; +PCRE2_PARTIAL_SOFT returns the complete match. +
++On reaching the end of the subject, when PCRE2_PARTIAL_HARD is set, if the next +pattern item is \z, \Z, \b, \B, or $ there is always a partial match. +Otherwise, for both options, the next pattern item must be one that inspects a +character, and at least one of the following must be true: +
++(1) At least one character has already been inspected. An inspected character +need not form part of the final matched string; lookbehind assertions and the +\K escape sequence provide ways of inspecting characters before the start of a +matched string. +
++(2) The pattern contains one or more lookbehind assertions. This condition +exists in case there is a lookbehind that inspects characters before the start +of the match. +
++(3) There is a special case when the whole pattern can match an empty string. +When the starting point is at the end of the subject, the empty string match is +a possibility, and if PCRE2_PARTIAL_SOFT is set and neither of the above +conditions is true, it is returned. However, because adding more characters +might result in a non-empty match, PCRE2_PARTIAL_HARD returns a partial match, +which in this case means "there is going to be a match at this point, but until +some more characters are added, we do not know if it will be an empty string or +something longer". +
++When a partial matching option is set, the result of calling +pcre2_match() can be one of the following: +
++A successful match +A complete match has been found, starting and ending within this subject. +
++PCRE2_ERROR_NOMATCH +No match can start anywhere in this subject. +
++PCRE2_ERROR_PARTIAL +Adding more characters may result in a complete match that uses one or more +characters from the end of this subject. +
++When a partial match is returned, the first two elements in the ovector point +to the portion of the subject that was matched, but the values in the rest of +the ovector are undefined. The appearance of \K in the pattern has no effect +for a partial match. Consider this pattern: +
+ /abc\K123/ ++If it is matched against "456abc123xyz" the result is a complete match, and the +ovector defines the matched string as "123", because \K resets the "start of +match" point. However, if a partial match is requested and the subject string +is "456abc12", a partial match is found for the string "abc12", because all +these characters are needed for a subsequent re-match with additional +characters. + +
+If there is more than one partial match, the first one that was found provides +the data that is returned. Consider this pattern: +
+ /123\w+X|dogY/ ++If this is matched against the subject string "abc123dog", both alternatives +fail to match, but the end of the subject is reached during matching, so +PCRE2_ERROR_PARTIAL is returned. The offsets are set to 3 and 9, identifying +"123dog" as the first partial match. (In this example, there are two partial +matches, because "dog" on its own partially matches the second alternative.) + +
+What happens when a partial match is identified depends on which of the two +partial matching options is set. +
++If PCRE2_PARTIAL_HARD is set, PCRE2_ERROR_PARTIAL is returned as soon as a +partial match is found, without continuing to search for possible complete +matches. This option is "hard" because it prefers an earlier partial match over +a later complete match. For this reason, the assumption is made that the end of +the supplied subject string is not the true end of the available data, which is +why \z, \Z, \b, \B, and $ always give a partial match. +
++If PCRE2_PARTIAL_SOFT is set, the partial match is remembered, but matching +continues as normal, and other alternatives in the pattern are tried. If no +complete match can be found, PCRE2_ERROR_PARTIAL is returned instead of +PCRE2_ERROR_NOMATCH. This option is "soft" because it prefers a complete match +over a partial match. All the various matching items in a pattern behave as if +the subject string is potentially complete; \z, \Z, and $ match at the end of +the subject, as normal, and for \b and \B the end of the subject is treated +as a non-alphanumeric. +
++The difference between the two partial matching options can be illustrated by a +pattern such as: +
+ /dog(sbody)?/ ++This matches either "dog" or "dogsbody", greedily (that is, it prefers the +longer string if possible). If it is matched against the string "dog" with +PCRE2_PARTIAL_SOFT, it yields a complete match for "dog". However, if +PCRE2_PARTIAL_HARD is set, the result is PCRE2_ERROR_PARTIAL. On the other +hand, if the pattern is made ungreedy the result is different: +
+ /dog(sbody)??/ ++In this case the result is always a complete match because that is found first, +and matching never continues after finding a complete match. It might be easier +to follow this explanation by thinking of the two patterns like this: +
+ /dog(sbody)?/ is the same as /dogsbody|dog/ + /dog(sbody)??/ is the same as /dog|dogsbody/ ++The second pattern will never match "dogsbody", because it will always find the +shorter match first. + +
+The pcre2test data modifiers partial_hard (or ph) and +partial_soft (or ps) set PCRE2_PARTIAL_HARD and PCRE2_PARTIAL_SOFT, +respectively, when calling pcre2_match(). Here is a run of +pcre2test using a pattern that matches the whole subject in the form of a +date: +
+ re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/ + data> 25dec3\=ph + Partial match: 25dec3 + data> 3ju\=ph + Partial match: 3ju + data> 3juj\=ph + No match ++This example gives the same results for both hard and soft partial matching +options. Here is an example where there is a difference: +
+ re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/ + data> 25jun04\=ps + 0: 25jun04 + 1: jun + data> 25jun04\=ph + Partial match: 25jun04 ++With PCRE2_PARTIAL_SOFT, the subject is matched completely. For +PCRE2_PARTIAL_HARD, however, the subject is assumed not to be complete, so +there is only a partial match. + +
+PCRE was not originally designed with multi-segment matching in mind. However, +over time, features (including partial matching) that make multi-segment +matching possible have been added. A very long string can be searched segment +by segment by calling pcre2_match() repeatedly, with the aim of achieving +the same results that would happen if the entire string was available for +searching all the time. Normally, the strings that are being sought are much +shorter than each individual segment, and are in the middle of very long +strings, so the pattern is normally not anchored. +
++Special logic must be implemented to handle a matched substring that spans a +segment boundary. PCRE2_PARTIAL_HARD should be used, because it returns a +partial match at the end of a segment whenever there is the possibility of +changing the match by adding more characters. The PCRE2_NOTBOL option should +also be set for all but the first segment. +
++When a partial match occurs, the next segment must be added to the current +subject and the match re-run, using the startoffset argument of +pcre2_match() to begin at the point where the partial match started. +For example: +
+ re> /\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d/ + data> ...the date is 23ja\=ph + Partial match: 23ja + data> ...the date is 23jan19 and on that day...\=offset=15 + 0: 23jan19 + 1: jan ++Note the use of the offset modifier to start the new match where the +partial match was found. In this example, the next segment was added to the one +in which the partial match was found. This is the most straightforward +approach, typically using a memory buffer that is twice the size of each +segment. After a partial match, the first half of the buffer is discarded, the +second half is moved to the start of the buffer, and a new segment is added +before repeating the match as in the example above. After a no match, the +entire buffer can be discarded. + +
+If there are memory constraints, you may want to discard text that precedes a +partial match before adding the next segment. Unfortunately, this is not at +present straightforward. In cases such as the above, where the pattern does not +contain any lookbehinds, it is sufficient to retain only the partially matched +substring. However, if the pattern contains a lookbehind assertion, characters +that precede the start of the partial match may have been inspected during the +matching process. When pcre2test displays a partial match, it indicates +these characters with '<' if the allusedtext modifier is set: +
+ re> "(?<=123)abc" + data> xx123ab\=ph,allusedtext + Partial match: 123ab + <<< ++However, the allusedtext modifier is not available for JIT matching, +because JIT matching does not record the first (or last) consulted characters. +For this reason, this information is not available via the API. It is therefore +not possible in general to obtain the exact number of characters that must be +retained in order to get the right match result. If you cannot retain the +entire segment, you must find some heuristic way of choosing. + +
+If you know the approximate length of the matching substrings, you can use that +to decide how much text to retain. The only lookbehind information that is +currently available via the API is the length of the longest individual +lookbehind in a pattern, but this can be misleading if there are nested +lookbehinds. The value returned by calling pcre2_pattern_info() with the +PCRE2_INFO_MAXLOOKBEHIND option is the maximum number of characters (not code +units) that any individual lookbehind moves back when it is processed. A +pattern such as "(?<=(?<!b)a)" has a maximum lookbehind value of one, but +inspects two characters before its starting point. +
++In a non-UTF or a 32-bit case, moving back is just a subtraction, but in +UTF-8 or UTF-16 you have to count characters while moving back through the code +units. +
++The DFA function moves along the subject string character by character, without +backtracking, searching for all possible matches simultaneously. If the end of +the subject is reached before the end of the pattern, there is the possibility +of a partial match. +
++When PCRE2_PARTIAL_SOFT is set, PCRE2_ERROR_PARTIAL is returned only if there +have been no complete matches. Otherwise, the complete matches are returned. +If PCRE2_PARTIAL_HARD is set, a partial match takes precedence over any +complete matches. The portion of the string that was matched when the longest +partial match was found is set as the first matching string. +
+Because the DFA function always searches for all possible matches, and there is +no difference between greedy and ungreedy repetition, its behaviour is +different from that of pcre2_match(). Consider the string "dog" matched +against this ungreedy pattern: +
+ /dog(sbody)??/ ++Whereas the standard function stops as soon as it finds the complete match for +"dog", the DFA function also finds the partial match for "dogsbody", and so +returns that when PCRE2_PARTIAL_HARD is set. + +
+When a partial match has been found using the DFA matching function, it is +possible to continue the match by providing additional subject data and calling +the function again with the same compiled regular expression, this time setting +the PCRE2_DFA_RESTART option. You must pass the same working space as before, +because this is where details of the previous partial match are stored. You can +set the PCRE2_PARTIAL_SOFT or PCRE2_PARTIAL_HARD options with PCRE2_DFA_RESTART +to continue partial matching over multiple segments. Here is an example using +pcre2test: +
+ re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/ + data> 23ja\=dfa,ps + Partial match: 23ja + data> n05\=dfa,dfa_restart + 0: n05 ++The first call has "23ja" as the subject, and requests partial matching; the +second call has "n05" as the subject for the continued (restarted) match. +Notice that when the match is complete, only the last part is shown; PCRE2 does +not retain the previously partially-matched string. It is up to the calling +program to do that if it needs to. This means that, for an unanchored pattern, +if a continued match fails, it is not possible to try again at a new starting +point. All this facility is capable of doing is continuing with the previous +match attempt. For example, consider this pattern: +
+ 1234|3789 ++If the first part of the subject is "ABC123", a partial match of the first +alternative is found at offset 3. There is no partial match for the second +alternative, because such a match does not start at the same point in the +subject string. Attempting to continue with the string "7890" does not yield a +match because only those alternatives that match at one point in the subject +are remembered. Depending on the application, this may or may not be what you +want. + +
+If you do want to allow for starting again at the next character, one way of +doing it is to retain some or all of the segment and try a new complete match, +as described for pcre2_match() above. Another possibility is to work with +two buffers. If a partial match at offset n in the first buffer is +followed by "no match" when PCRE2_DFA_RESTART is used on the second buffer, you +can then try a new match starting at offset n+1 in the first buffer. +
+
+Philip Hazel
+
+Retired from University Computing Service
+
+Cambridge, England.
+
+
+Last updated: 04 September 2019
+
+Copyright © 1997-2019 University of Cambridge.
+
+
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2pattern.html b/doc/html/pcre2pattern.html new file mode 100644 index 0000000..cf50c1a --- /dev/null +++ b/doc/html/pcre2pattern.html @@ -0,0 +1,3855 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+The syntax and semantics of the regular expressions that are supported by PCRE2 +are described in detail below. There is a quick-reference syntax summary in the +pcre2syntax +page. PCRE2 tries to match Perl syntax and semantics as closely as it can. +PCRE2 also supports some alternative regular expression syntax (which does not +conflict with the Perl syntax) in order to provide some compatibility with +regular expressions in Python, .NET, and Oniguruma. +
++Perl's regular expressions are described in its own documentation, and regular +expressions in general are covered in a number of books, some of which have +copious examples. Jeffrey Friedl's "Mastering Regular Expressions", published +by O'Reilly, covers regular expressions in great detail. This description of +PCRE2's regular expressions is intended as reference material. +
++This document discusses the regular expression patterns that are supported by +PCRE2 when its main matching function, pcre2_match(), is used. PCRE2 also +has an alternative matching function, pcre2_dfa_match(), which matches +using a different algorithm that is not Perl-compatible. Some of the features +discussed below are not available when DFA matching is used. The advantages and +disadvantages of the alternative function, and how it differs from the normal +function, are discussed in the +pcre2matching +page. +
++A number of options that can be passed to pcre2_compile() can also be set +by special items at the start of a pattern. These are not Perl-compatible, but +are provided to make these options accessible to pattern writers who are not +able to change the program that processes the pattern. Any number of these +items may appear, but they must all be together right at the start of the +pattern string, and the letters must be in upper case. +
+In the 8-bit and 16-bit PCRE2 libraries, characters may be coded either as +single code units, or as multiple UTF-8 or UTF-16 code units. UTF-32 can be +specified for the 32-bit library, in which case it constrains the character +values to valid Unicode code points. To process UTF strings, PCRE2 must be +built to include Unicode support (which is the default). When using UTF strings +you must either call the compiling function with one or both of the PCRE2_UTF +or PCRE2_MATCH_INVALID_UTF options, or the pattern must start with the special +sequence (*UTF), which is equivalent to setting the PCRE2_UTF option. How +setting a UTF mode affects pattern matching is mentioned in several places +below. There is also a summary of features in the +pcre2unicode +page. +
++Some applications that allow their users to supply patterns may wish to +restrict them to non-UTF data for security reasons. If the PCRE2_NEVER_UTF +option is passed to pcre2_compile(), (*UTF) is not allowed, and its +appearance in a pattern causes an error. +
+Another special sequence that may appear at the start of a pattern is (*UCP). +This has the same effect as setting the PCRE2_UCP option: it causes sequences +such as \d and \w to use Unicode properties to determine character types, +instead of recognizing only characters with codes less than 256 via a lookup +table. It also causes upper/lower casing operations to use Unicode properties +for characters with code points greater than 127, even when UTF is not set. +These behaviours can be changed within the pattern; see the section entitled +"Internal Option Setting" +below. +
++Some applications that allow their users to supply patterns may wish to +restrict them for security reasons. If the PCRE2_NEVER_UCP option is passed to +pcre2_compile(), (*UCP) is not allowed, and its appearance in a pattern +causes an error. +
++Starting a pattern with (*NOTEMPTY) or (*NOTEMPTY_ATSTART) has the same effect +as passing the PCRE2_NOTEMPTY or PCRE2_NOTEMPTY_ATSTART option to whichever +matching function is subsequently called to match the pattern. These options +lock out the matching of empty strings, either entirely, or only at the start +of the subject. +
++If a pattern starts with (*NO_AUTO_POSSESS), it has the same effect as setting +the PCRE2_NO_AUTO_POSSESS option. This stops PCRE2 from making quantifiers +possessive when what follows cannot match the repeated item. For example, by +default a+b is treated as a++b. For more details, see the +pcre2api +documentation. +
++If a pattern starts with (*NO_START_OPT), it has the same effect as setting the +PCRE2_NO_START_OPTIMIZE option. This disables several optimizations for quickly +reaching "no match" results. For more details, see the +pcre2api +documentation. +
++If a pattern starts with (*NO_DOTSTAR_ANCHOR), it has the same effect as +setting the PCRE2_NO_DOTSTAR_ANCHOR option. This disables optimizations that +apply to patterns whose top-level branches all start with .* (match any number +of arbitrary characters). For more details, see the +pcre2api +documentation. +
++If a pattern that starts with (*NO_JIT) is successfully compiled, an attempt by +the application to apply the JIT optimization by calling +pcre2_jit_compile() is ignored. +
++The pcre2_match() function contains a counter that is incremented every +time it goes round its main loop. The caller of pcre2_match() can set a +limit on this counter, which therefore limits the amount of computing resource +used for a match. The maximum depth of nested backtracking can also be limited; +this indirectly restricts the amount of heap memory that is used, but there is +also an explicit memory limit that can be set. +
++These facilities are provided to catch runaway matches that are provoked by +patterns with huge matching trees. A common example is a pattern with nested +unlimited repeats applied to a long string that does not match. When one of +these limits is reached, pcre2_match() gives an error return. The limits +can also be set by items at the start of the pattern of the form +
+ (*LIMIT_HEAP=d) + (*LIMIT_MATCH=d) + (*LIMIT_DEPTH=d) ++where d is any number of decimal digits. However, the value of the setting must +be less than the value set (or defaulted) by the caller of pcre2_match() +for it to have any effect. In other words, the pattern writer can lower the +limits set by the programmer, but not raise them. If there is more than one +setting of one of these limits, the lower value is used. The heap limit is +specified in kibibytes (units of 1024 bytes). + +
+Prior to release 10.30, LIMIT_DEPTH was called LIMIT_RECURSION. This name is +still recognized for backwards compatibility. +
++The heap limit applies only when the pcre2_match() or +pcre2_dfa_match() interpreters are used for matching. It does not apply +to JIT. The match limit is used (but in a different way) when JIT is being +used, or when pcre2_dfa_match() is called, to limit computing resource +usage by those matching functions. The depth limit is ignored by JIT but is +relevant for DFA matching, which uses function recursion for recursions within +the pattern and for lookaround assertions and atomic groups. In this case, the +depth limit controls the depth of such recursion. +
++PCRE2 supports six different conventions for indicating line breaks in +strings: a single CR (carriage return) character, a single LF (linefeed) +character, the two-character sequence CRLF, any of the three preceding, any +Unicode newline sequence, or the NUL character (binary zero). The +pcre2api +page has +further discussion +about newlines, and shows how to set the newline convention when calling +pcre2_compile(). +
++It is also possible to specify a newline convention by starting a pattern +string with one of the following sequences: +
+ (*CR) carriage return + (*LF) linefeed + (*CRLF) carriage return, followed by linefeed + (*ANYCRLF) any of the three above + (*ANY) all Unicode newline sequences + (*NUL) the NUL character (binary zero) ++These override the default and the options given to the compiling function. For +example, on a Unix system where LF is the default newline sequence, the pattern +
+ (*CR)a.b ++changes the convention to CR. That pattern matches "a\nb" because LF is no +longer a newline. If more than one of these settings is present, the last one +is used. + +
+The newline convention affects where the circumflex and dollar assertions are +true. It also affects the interpretation of the dot metacharacter when +PCRE2_DOTALL is not set, and the behaviour of \N when not followed by an +opening brace. However, it does not affect what the \R escape sequence +matches. By default, this is any Unicode newline sequence, for Perl +compatibility. However, this can be changed; see the next section and the +description of \R in the section entitled +"Newline sequences" +below. A change of \R setting can be combined with a change of newline +convention. +
++It is possible to restrict \R to match only CR, LF, or CRLF (instead of the +complete set of Unicode line endings) by setting the option PCRE2_BSR_ANYCRLF +at compile time. This effect can also be achieved by starting a pattern with +(*BSR_ANYCRLF). For completeness, (*BSR_UNICODE) is also recognized, +corresponding to PCRE2_BSR_UNICODE. +
++PCRE2 can be compiled to run in an environment that uses EBCDIC as its +character code instead of ASCII or Unicode (typically a mainframe system). In +the sections below, character code values are ASCII or Unicode; in an EBCDIC +environment these characters may have different code values, and there are no +code points greater than 255. +
++A regular expression is a pattern that is matched against a subject string from +left to right. Most characters stand for themselves in a pattern, and match the +corresponding characters in the subject. As a trivial example, the pattern +
+ The quick brown fox ++matches a portion of a subject string that is identical to itself. When +caseless matching is specified (the PCRE2_CASELESS option or (?i) within the +pattern), letters are matched independently of case. Note that there are two +ASCII characters, K and S, that, in addition to their lower case ASCII +equivalents, are case-equivalent with Unicode U+212A (Kelvin sign) and U+017F +(long S) respectively when either PCRE2_UTF or PCRE2_UCP is set, unless the +PCRE2_EXTRA_CASELESS_RESTRICT option is in force (either passed to +pcre2_compile() or set by (?r) within the pattern). + +
+The power of regular expressions comes from the ability to include wild cards, +character classes, alternatives, and repetitions in the pattern. These are +encoded in the pattern by the use of metacharacters, which do not stand +for themselves but instead are interpreted in some special way. +
++There are two different sets of metacharacters: those that are recognized +anywhere in the pattern except within square brackets, and those that are +recognized within square brackets. Outside square brackets, the metacharacters +are as follows: +
+ \ general escape character with several uses
+ ^ assert start of string (or line, in multiline mode)
+ $ assert end of string (or line, in multiline mode)
+ . match any character except newline (by default)
+ [ start character class definition
+ | start of alternative branch
+ ( start group or control verb
+ ) end group or control verb
+ * 0 or more quantifier
+ + 1 or more quantifier; also "possessive quantifier"
+ ? 0 or 1 quantifier; also quantifier minimizer
+ { potential start of min/max quantifier
+
+Brace characters { and } are also used to enclose data for constructions such
+as \g{2} or \k{name}. In almost all uses of braces, space and/or horizontal
+tab characters that follow { or precede } are allowed and are ignored. In the
+case of quantifiers, they may also appear before or after the comma. The
+exception to this is \u{...} which is an ECMAScript compatibility feature
+that is recognized only when the PCRE2_EXTRA_ALT_BSUX option is set. ECMAScript
+does not ignore such white space; it causes the item to be interpreted as
+literal.
+
++Part of a pattern that is in square brackets is called a "character class". In +a character class the only metacharacters are: +
+ \ general escape character + ^ negate the class, but only if the first character + - indicates character range + [ POSIX character class (if followed by POSIX syntax) + ] terminates the character class ++If a pattern is compiled with the PCRE2_EXTENDED option, most white space in +the pattern, other than in a character class, within a \Q...\E sequence, or +between a # outside a character class and the next newline, inclusive, are +ignored. An escaping backslash can be used to include a white space or a # +character as part of the pattern. If the PCRE2_EXTENDED_MORE option is set, the +same applies, but in addition unescaped space and horizontal tab characters are +ignored inside a character class. Note: only these two characters are ignored, +not the full set of pattern white space characters that are ignored outside a +character class. Option settings can be changed within a pattern; see the +section entitled +"Internal Option Setting" +below. + +
+The following sections describe the use of each of the metacharacters. +
++The backslash character has several uses. Firstly, if it is followed by a +character that is not a digit or a letter, it takes away any special meaning +that character may have. This use of backslash as an escape character applies +both inside and outside character classes. +
++For example, if you want to match a * character, you must write \* in the +pattern. This escaping action applies whether or not the following character +would otherwise be interpreted as a metacharacter, so it is always safe to +precede a non-alphanumeric with backslash to specify that it stands for itself. +In particular, if you want to match a backslash, you write \\. +
++Only ASCII digits and letters have any special meaning after a backslash. All +other characters (in particular, those whose code points are greater than 127) +are treated as literals. +
++If you want to treat all characters in a sequence as literals, you can do so by +putting them between \Q and \E. Note that this includes white space even when +the PCRE2_EXTENDED option is set so that most other white space is ignored. The +behaviour is different from Perl in that $ and @ are handled as literals in +\Q...\E sequences in PCRE2, whereas in Perl, $ and @ cause variable +interpolation. Also, Perl does "double-quotish backslash interpolation" on any +backslashes between \Q and \E which, its documentation says, "may lead to +confusing results". PCRE2 treats a backslash between \Q and \E just like any +other character. Note the following examples: +
+ Pattern PCRE2 matches Perl matches + + \Qabc$xyz\E abc$xyz abc followed by the contents of $xyz + \Qabc\$xyz\E abc\$xyz abc\$xyz + \Qabc\E\$\Qxyz\E abc$xyz abc$xyz + \QA\B\E A\B A\B + \Q\\E \ \\E ++The \Q...\E sequence is recognized both inside and outside character classes. +An isolated \E that is not preceded by \Q is ignored. If \Q is not followed +by \E later in the pattern, the literal interpretation continues to the end of +the pattern (that is, \E is assumed at the end). If the isolated \Q is inside +a character class, this causes an error, because the character class is then +not terminated by a closing square bracket. + +
+A second use of backslash provides a way of encoding non-printing characters +in patterns in a visible manner. There is no restriction on the appearance of +non-printing characters in a pattern, but when a pattern is being prepared by +text editing, it is often easier to use one of the following escape sequences +instead of the binary character it represents. In an ASCII or Unicode +environment, these escapes are as follows: +
+ \a alarm, that is, the BEL character (hex 07)
+ \cx "control-x", where x is a non-control ASCII character
+ \e escape (hex 1B)
+ \f form feed (hex 0C)
+ \n linefeed (hex 0A)
+ \r carriage return (hex 0D) (but see below)
+ \t tab (hex 09)
+ \0dd character with octal code 0dd
+ \ddd character with octal code ddd, or backreference
+ \o{ddd..} character with octal code ddd..
+ \xhh character with hex code hh
+ \x{hhh..} character with hex code hhh..
+ \N{U+hhh..} character with Unicode hex code point hhh..
+
+By default, after \x that is not followed by {, from zero to two hexadecimal
+digits are read (letters can be in upper or lower case). Any number of
+hexadecimal digits may appear between \x{ and }. If a character other than a
+hexadecimal digit appears between \x{ and }, or if there is no terminating },
+an error occurs.
+
++Characters whose code points are less than 256 can be defined by either of the +two syntaxes for \x or by an octal sequence. There is no difference in the way +they are handled. For example, \xdc is exactly the same as \x{dc} or \334. +However, using the braced versions does make such sequences easier to read. +
+Support is available for some ECMAScript (aka JavaScript) escape sequences via +two compile-time options. If PCRE2_ALT_BSUX is set, the sequence \x followed +by { is not recognized. Only if \x is followed by two hexadecimal digits is it +recognized as a character escape. Otherwise it is interpreted as a literal "x" +character. In this mode, support for code points greater than 255 is provided +by \u, which must be followed by four hexadecimal digits; otherwise it is +interpreted as a literal "u" character. +
++PCRE2_EXTRA_ALT_BSUX has the same effect as PCRE2_ALT_BSUX and, in addition, +\u{hhh..} is recognized as the character specified by hexadecimal code point. +There may be any number of hexadecimal digits, but unlike other places that +also use curly brackets, spaces are not allowed and would result in the string +being interpreted as a literal. This syntax is from ECMAScript 6. +
++The \N{U+hhh..} escape sequence is recognized only when PCRE2 is operating in +UTF mode. Perl also uses \N{name} to specify characters by Unicode name; PCRE2 +does not support this. Note that when \N is not followed by an opening brace +(curly bracket) it has an entirely different meaning, matching any character +that is not a newline. +
++There are some legacy applications where the escape sequence \r is expected to +match a newline. If the PCRE2_EXTRA_ESCAPED_CR_IS_LF option is set, \r in a +pattern is converted to \n so that it matches a LF (linefeed) instead of a CR +(carriage return) character. +
++An error occurs if \c is not followed by a character whose ASCII code point +is in the range 32 to 126. The precise effect of \cx is as follows: if x is a +lower case letter, it is converted to upper case. Then bit 6 of the character +(hex 40) is inverted. Thus \cA to \cZ become hex 01 to hex 1A (A is 41, Z is +5A), but \c{ becomes hex 3B ({ is 7B), and \c; becomes hex 7B (; is 3B). If +the code unit following \c has a code point less than 32 or greater than 126, +a compile-time error occurs. +
++When PCRE2 is compiled in EBCDIC mode, \N{U+hhh..} is not supported. \a, \e, +\f, \n, \r, and \t generate the appropriate EBCDIC code values. The \c +escape is processed as specified for Perl in the perlebcdic document. The +only characters that are allowed after \c are A-Z, a-z, or one of @, [, \, ], +^, _, or ?. Any other character provokes a compile-time error. The sequence +\c@ encodes character code 0; after \c the letters (in either case) encode +characters 1-26 (hex 01 to hex 1A); [, \, ], ^, and _ encode characters 27-31 +(hex 1B to hex 1F), and \c? becomes either 255 (hex FF) or 95 (hex 5F). +
++Thus, apart from \c?, these escapes generate the same character code values as +they do in an ASCII environment, though the meanings of the values mostly +differ. For example, \cG always generates code value 7, which is BEL in ASCII +but DEL in EBCDIC. +
++The sequence \c? generates DEL (127, hex 7F) in an ASCII environment, but +because 127 is not a control character in EBCDIC, Perl makes it generate the +APC character. Unfortunately, there are several variants of EBCDIC. In most of +them the APC character has the value 255 (hex FF), but in the one Perl calls +POSIX-BC its value is 95 (hex 5F). If certain other characters have POSIX-BC +values, PCRE2 makes \c? generate 95; otherwise it generates 255. +
++After \0 up to two further octal digits are read. If there are fewer than two +digits, just those that are present are used. Thus the sequence \0\x\015 +specifies two binary zeros followed by a CR character (code value 13). Make +sure you supply two digits after the initial zero if the pattern character that +follows is itself an octal digit. +
+The escape \o must be followed by a sequence of octal digits, enclosed in +braces. An error occurs if this is not the case. This escape is a recent +addition to Perl; it provides a way of specifying character code points as octal +numbers greater than 0777, and it also allows octal numbers and backreferences +to be unambiguously specified. +
++For greater clarity and unambiguity, it is best to avoid following \ by a +digit greater than zero. Instead, use \o{...} or \x{...} to specify numerical +character code points, and \g{...} to specify backreferences. The following +paragraphs describe the old, ambiguous syntax. +
++The handling of a backslash followed by a digit other than 0 is complicated, +and Perl has changed over time, causing PCRE2 also to change. +
++Outside a character class, PCRE2 reads the digit and any following digits as a +decimal number. If the number is less than 10, begins with the digit 8 or 9, or +if there are at least that many previous capture groups in the expression, the +entire sequence is taken as a backreference. A description of how this +works is given +later, +following the discussion of +parenthesized groups. +Otherwise, up to three octal digits are read to form a character code. +
++Inside a character class, PCRE2 handles \8 and \9 as the literal characters +"8" and "9", and otherwise reads up to three octal digits following the +backslash, using them to generate a data character. Any subsequent digits stand +for themselves. For example, outside a character class: +
+ \040 is another way of writing an ASCII space + \40 is the same, provided there are fewer than 40 previous capture groups + \7 is always a backreference + \11 might be a backreference, or another way of writing a tab + \011 is always a tab + \0113 is a tab followed by the character "3" + \113 might be a backreference, otherwise the character with octal code 113 + \377 might be a backreference, otherwise the value 255 (decimal) + \81 is always a backreference ++Note that octal values of 100 or greater that are specified using this syntax +must not be introduced by a leading zero, because no more than three octal +digits are ever read. + +
+Characters that are specified using octal or hexadecimal numbers are +limited to certain values, as follows: +
+ 8-bit non-UTF mode no greater than 0xff + 16-bit non-UTF mode no greater than 0xffff + 32-bit non-UTF mode no greater than 0xffffffff + All UTF modes no greater than 0x10ffff and a valid code point ++Invalid Unicode code points are all those in the range 0xd800 to 0xdfff (the +so-called "surrogate" code points). The check for these can be disabled by the +caller of pcre2_compile() by setting the option +PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES. However, this is possible only in UTF-8 +and UTF-32 modes, because these values are not representable in UTF-16. + +
+All the sequences that define a single character value can be used both inside +and outside character classes. In addition, inside a character class, \b is +interpreted as the backspace character (hex 08). +
++When not followed by an opening brace, \N is not allowed in a character class. +\B, \R, and \X are not special inside a character class. Like other +unrecognized alphabetic escape sequences, they cause an error. Outside a +character class, these sequences have different meanings. +
++In Perl, the sequences \F, \l, \L, \u, and \U are recognized by its string +handler and used to modify the case of following characters. By default, PCRE2 +does not support these escape sequences in patterns. However, if either of the +PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX options is set, \U matches a "U" +character, and \u can be used to define a character by code point, as +described above. +
++The sequence \g followed by a signed or unsigned number, optionally enclosed +in braces, is an absolute or relative backreference. A named backreference +can be coded as \g{name}. Backreferences are discussed +later, +following the discussion of +parenthesized groups. +
++For compatibility with Oniguruma, the non-Perl syntax \g followed by a name or +a number enclosed either in angle brackets or single quotes, is an alternative +syntax for referencing a capture group as a subroutine. Details are discussed +later. +Note that \g{...} (Perl syntax) and \g<...> (Oniguruma syntax) are not +synonymous. The former is a backreference; the latter is a +subroutine +call. +
++Another use of backslash is for specifying generic character types: +
+ \d any decimal digit + \D any character that is not a decimal digit + \h any horizontal white space character + \H any character that is not a horizontal white space character + \N any character that is not a newline + \s any white space character + \S any character that is not a white space character + \v any vertical white space character + \V any character that is not a vertical white space character + \w any "word" character + \W any "non-word" character ++The \N escape sequence has the same meaning as +the "." metacharacter +when PCRE2_DOTALL is not set, but setting PCRE2_DOTALL does not change the +meaning of \N. Note that when \N is followed by an opening brace it has a +different meaning. See the section entitled +"Non-printing characters" +above for details. Perl also uses \N{name} to specify characters by Unicode +name; PCRE2 does not support this. + +
+Each pair of lower and upper case escape sequences partitions the complete set +of characters into two disjoint sets. Any given character matches one, and only +one, of each pair. The sequences can appear both inside and outside character +classes. They each match one character of the appropriate type. If the current +matching point is at the end of the subject string, all of them fail, because +there is no character to match. +
++The default \s characters are HT (9), LF (10), VT (11), FF (12), CR (13), and +space (32), which are defined as white space in the "C" locale. This list may +vary if locale-specific matching is taking place. For example, in some locales +the "non-breaking space" character (\xA0) is recognized as white space, and in +others the VT character is not. +
++A "word" character is an underscore or any character that is a letter or digit. +By default, the definition of letters and digits is controlled by PCRE2's +low-valued character tables, and may vary if locale-specific matching is taking +place (see +"Locale support" +in the +pcre2api +page). For example, in a French locale such as "fr_FR" in Unix-like systems, +or "french" in Windows, some character codes greater than 127 are used for +accented letters, and these are then matched by \w. The use of locales with +Unicode is discouraged. +
++By default, characters whose code points are greater than 127 never match \d, +\s, or \w, and always match \D, \S, and \W, although this may be different +for characters in the range 128-255 when locale-specific matching is happening. +These escape sequences retain their original meanings from before Unicode +support was available, mainly for efficiency reasons. If the PCRE2_UCP option +is set, the behaviour is changed so that Unicode properties are used to +determine character types, as follows: +
+ \d any character that matches \p{Nd} (decimal digit)
+ \s any character that matches \p{Z} or \h or \v
+ \w any character that matches \p{L}, \p{N}, \p{Mn}, or \p{Pc}
+
+The addition of \p{Mn} (non-spacing mark) and the replacement of an explicit
+test for underscore with a test for \p{Pc} (connector punctuation) happened in
+PCRE2 release 10.43. This brings PCRE2 into line with Perl.
+
+The upper case escapes match the inverse sets of characters. Note that \d +matches only decimal digits, whereas \w matches any Unicode digit, as well as +other character categories. Note also that PCRE2_UCP affects \b and +\B because they are defined in terms of \w and \W. Matching these sequences +is noticeably slower when PCRE2_UCP is set. +
++The effect of PCRE2_UCP on any one of these escape sequences can be negated by +the options PCRE2_EXTRA_ASCII_BSD, PCRE2_EXTRA_ASCII_BSS, and +PCRE2_EXTRA_ASCII_BSW, respectively. These options can be set and reset within +a pattern by means of an internal option setting +(see below). +
++The sequences \h, \H, \v, and \V, in contrast to the other sequences, which +match only ASCII characters by default, always match a specific list of code +points, whether or not PCRE2_UCP is set. The horizontal space characters are: +
+ U+0009 Horizontal tab (HT) + U+0020 Space + U+00A0 Non-break space + U+1680 Ogham space mark + U+180E Mongolian vowel separator + U+2000 En quad + U+2001 Em quad + U+2002 En space + U+2003 Em space + U+2004 Three-per-em space + U+2005 Four-per-em space + U+2006 Six-per-em space + U+2007 Figure space + U+2008 Punctuation space + U+2009 Thin space + U+200A Hair space + U+202F Narrow no-break space + U+205F Medium mathematical space + U+3000 Ideographic space ++The vertical space characters are: +
+ U+000A Linefeed (LF) + U+000B Vertical tab (VT) + U+000C Form feed (FF) + U+000D Carriage return (CR) + U+0085 Next line (NEL) + U+2028 Line separator + U+2029 Paragraph separator ++In 8-bit, non-UTF-8 mode, only the characters with code points less than 256 +are relevant. + +
+Outside a character class, by default, the escape sequence \R matches any +Unicode newline sequence. In 8-bit non-UTF-8 mode \R is equivalent to the +following: +
+ (?>\r\n|\n|\x0b|\f|\r|\x85) ++This is an example of an "atomic group", details of which are given +below. +This particular group matches either the two-character sequence CR followed by +LF, or one of the single characters LF (linefeed, U+000A), VT (vertical tab, +U+000B), FF (form feed, U+000C), CR (carriage return, U+000D), or NEL (next +line, U+0085). Because this is an atomic group, the two-character sequence is +treated as a single unit that cannot be split. + +
+In other modes, two additional characters whose code points are greater than 255 +are added: LS (line separator, U+2028) and PS (paragraph separator, U+2029). +Unicode support is not needed for these characters to be recognized. +
++It is possible to restrict \R to match only CR, LF, or CRLF (instead of the +complete set of Unicode line endings) by setting the option PCRE2_BSR_ANYCRLF +at compile time. (BSR is an abbreviation for "backslash R".) This can be made +the default when PCRE2 is built; if this is the case, the other behaviour can +be requested via the PCRE2_BSR_UNICODE option. It is also possible to specify +these settings by starting a pattern string with one of the following +sequences: +
+ (*BSR_ANYCRLF) CR, LF, or CRLF only + (*BSR_UNICODE) any Unicode newline sequence ++These override the default and the options given to the compiling function. +Note that these special settings, which are not Perl-compatible, are recognized +only at the very start of a pattern, and that they must be in upper case. If +more than one of them is present, the last one is used. They can be combined +with a change of newline convention; for example, a pattern can start with: +
+ (*ANY)(*BSR_ANYCRLF) ++They can also be combined with the (*UTF) or (*UCP) special sequences. Inside a +character class, \R is treated as an unrecognized escape sequence, and causes +an error. + +
+When PCRE2 is built with Unicode support (the default), three additional escape +sequences that match characters with specific properties are available. They +can be used in any mode, though in 8-bit and 16-bit non-UTF modes these +sequences are of course limited to testing characters whose code points are +less than U+0100 and U+10000, respectively. In 32-bit non-UTF mode, code points +greater than 0x10ffff (the Unicode limit) may be encountered. These are all +treated as being in the Unknown script and with an unassigned type. +
++Matching characters by Unicode property is not fast, because PCRE2 has to do a +multistage table lookup in order to find a character's property. That is why +the traditional escape sequences such as \d and \w do not use Unicode +properties in PCRE2 by default, though you can make them do so by setting the +PCRE2_UCP option or by starting the pattern with (*UCP). +
++The extra escape sequences that provide property support are: +
+ \p{xx} a character with the xx property
+ \P{xx} a character without the xx property
+ \X a Unicode extended grapheme cluster
+
+The property names represented by xx above are not case-sensitive, and in
+accordance with Unicode's "loose matching" rules, spaces, hyphens, and
+underscores are ignored. There is support for Unicode script names, Unicode
+general category properties, "Any", which matches any character (including
+newline), Bidi_Class, a number of binary (yes/no) properties, and some special
+PCRE2 properties (described
+below).
+Certain other Perl properties such as "InMusicalSymbols" are not supported by
+PCRE2. Note that \P{Any} does not match any characters, so always causes a
+match failure.
+
++There are three different syntax forms for matching a script. Each Unicode +character has a basic script and, optionally, a list of other scripts ("Script +Extensions") with which it is commonly used. Using the Adlam script as an +example, \p{sc:Adlam} matches characters whose basic script is Adlam, whereas +\p{scx:Adlam} matches, in addition, characters that have Adlam in their +extensions list. The full names "script" and "script extensions" for the +property types are recognized, and an equals sign is an alternative to the +colon. If a script name is given without a property type, for example, +\p{Adlam}, it is treated as \p{scx:Adlam}. Perl changed to this +interpretation at release 5.26 and PCRE2 changed at release 10.40. +
++Unassigned characters (and in non-UTF 32-bit mode, characters with code points +greater than 0x10FFFF) are assigned the "Unknown" script. Others that are not +part of an identified script are lumped together as "Common". The current list +of recognized script names and their 4-character abbreviations can be obtained +by running this command: +
+ pcre2test -LS + ++ +
+Each character has exactly one Unicode general category property, specified by +a two-letter abbreviation. For compatibility with Perl, negation can be +specified by including a circumflex between the opening brace and the property +name. For example, \p{^Lu} is the same as \P{Lu}. +
++If only one letter is specified with \p or \P, it includes all the general +category properties that start with that letter. In this case, in the absence +of negation, the curly brackets in the escape sequence are optional; these two +examples have the same effect: +
+ \p{L}
+ \pL
+
+The following general category property codes are supported:
++ C Other + Cc Control + Cf Format + Cn Unassigned + Co Private use + Cs Surrogate + + L Letter + Ll Lower case letter + Lm Modifier letter + Lo Other letter + Lt Title case letter + Lu Upper case letter + + M Mark + Mc Spacing mark + Me Enclosing mark + Mn Non-spacing mark + + N Number + Nd Decimal number + Nl Letter number + No Other number + + P Punctuation + Pc Connector punctuation + Pd Dash punctuation + Pe Close punctuation + Pf Final punctuation + Pi Initial punctuation + Po Other punctuation + Ps Open punctuation + + S Symbol + Sc Currency symbol + Sk Modifier symbol + Sm Mathematical symbol + So Other symbol + + Z Separator + Zl Line separator + Zp Paragraph separator + Zs Space separator ++The special property LC, which has the synonym L&, is also supported: it +matches a character that has the Lu, Ll, or Lt property, in other words, a +letter that is not classified as a modifier or "other". + +
+The Cs (Surrogate) property applies only to characters whose code points are in +the range U+D800 to U+DFFF. These characters are no different to any other +character when PCRE2 is not in UTF mode (using the 16-bit or 32-bit library). +However, they are not valid in Unicode strings and so cannot be tested by PCRE2 +in UTF mode, unless UTF validity checking has been turned off (see the +discussion of PCRE2_NO_UTF_CHECK in the +pcre2api +page). +
++The long synonyms for property names that Perl supports (such as \p{Letter}) +are not supported by PCRE2, nor is it permitted to prefix any of these +properties with "Is". +
++No character that is in the Unicode table has the Cn (unassigned) property. +Instead, this property is assumed for any code point that is not in the +Unicode table. +
++Specifying caseless matching does not affect these escape sequences. For +example, \p{Lu} always matches only upper case letters. This is different from +the behaviour of current versions of Perl. +
++Unicode defines a number of binary properties, that is, properties whose only +values are true or false. You can obtain a list of those that are recognized by +\p and \P, along with their abbreviations, by running this command: +
+ pcre2test -LP + ++ +
+
+ \p{Bidi_Class:<class>} matches a character with the given class
+ \p{BC:<class>} matches a character with the given class
+
+The recognized classes are:
++ AL Arabic letter + AN Arabic number + B paragraph separator + BN boundary neutral + CS common separator + EN European number + ES European separator + ET European terminator + FSI first strong isolate + L left-to-right + LRE left-to-right embedding + LRI left-to-right isolate + LRO left-to-right override + NSM non-spacing mark + ON other neutral + PDF pop directional format + PDI pop directional isolate + R right-to-left + RLE right-to-left embedding + RLI right-to-left isolate + RLO right-to-left override + S segment separator + WS white space ++An equals sign may be used instead of a colon. The class names are +case-insensitive; only the short names listed above are recognized. + +
+The \X escape matches any number of Unicode characters that form an "extended +grapheme cluster", and treats the sequence as an atomic group +(see below). +Unicode supports various kinds of composite character by giving each character +a grapheme breaking property, and having rules that use these properties to +define the boundaries of extended grapheme clusters. The rules are defined in +Unicode Standard Annex 29, "Unicode Text Segmentation". Unicode 11.0.0 +abandoned the use of some previous properties that had been used for emojis. +Instead it introduced various emoji-specific properties. PCRE2 uses only the +Extended Pictographic property. +
++\X always matches at least one character. Then it decides whether to add +additional characters according to the following rules for ending a cluster: +
++1. End at the end of the subject string. +
++2. Do not end between CR and LF; otherwise end after any control character. +
++3. Do not break Hangul (a Korean script) syllable sequences. Hangul characters +are of five types: L, V, T, LV, and LVT. An L character may be followed by an +L, V, LV, or LVT character; an LV or V character may be followed by a V or T +character; an LVT or T character may be followed only by a T character. +
++4. Do not end before extending characters or spacing marks or the zero-width +joiner (ZWJ) character. Characters with the "mark" property always have the +"extend" grapheme breaking property. +
++5. Do not end after prepend characters. +
++6. Do not end within emoji modifier sequences or emoji ZWJ (zero-width +joiner) sequences. An emoji ZWJ sequence consists of a character with the +Extended_Pictographic property, optionally followed by one or more characters +with the Extend property, followed by the ZWJ character, followed by another +Extended_Pictographic character. +
++7. Do not break within emoji flag sequences. That is, do not break between +regional indicator (RI) characters if there are an odd number of RI characters +before the break point. +
++8. Otherwise, end the cluster. +
++As well as the standard Unicode properties described above, PCRE2 supports four +more that make it possible to convert traditional escape sequences such as \w +and \s to use Unicode properties. PCRE2 uses these non-standard, non-Perl +properties internally when PCRE2_UCP is set. However, they may also be used +explicitly. These properties are: +
+ Xan Any alphanumeric character + Xps Any POSIX space character + Xsp Any Perl space character + Xwd Any Perl "word" character ++Xan matches characters that have either the L (letter) or the N (number) +property. Xps matches the characters tab, linefeed, vertical tab, form feed, or +carriage return, and any other character that has the Z (separator) property. +Xsp is the same as Xps; in PCRE1 it used to exclude vertical tab, for Perl +compatibility, but Perl changed. Xwd matches the same characters as Xan, plus +those that match Mn (non-spacing mark) or Pc (connector punctuation, which +includes underscore). + +
+There is another non-standard property, Xuc, which matches any character that +can be represented by a Universal Character Name in C++ and other programming +languages. These are the characters $, @, ` (grave accent), and all characters +with Unicode code points greater than or equal to U+00A0, except for the +surrogates U+D800 to U+DFFF. Note that most base (ASCII) characters are +excluded. (Universal Character Names are of the form \uHHHH or \UHHHHHHHH +where H is a hexadecimal digit. Note that the Xuc property does not match these +sequences but the characters that they represent.) +
++In normal use, the escape sequence \K causes any previously matched characters +not to be included in the final matched sequence that is returned. For example, +the pattern: +
+ foo\Kbar ++matches "foobar", but reports that it has matched "bar". \K does not interact +with anchoring in any way. The pattern: +
+ ^foo\Kbar ++matches only when the subject begins with "foobar" (in single line mode), +though it again reports the matched string as "bar". This feature is similar to +a lookbehind assertion +(described below), +but the part of the pattern that precedes \K is not constrained to match a +limited number of characters, as is required for a lookbehind assertion. The +use of \K does not interfere with the setting of +captured substrings. +For example, when the pattern +
+ (foo)\Kbar ++matches "foobar", the first substring is still set to "foo". + +
+From version 5.32.0 Perl forbids the use of \K in lookaround assertions. From +release 10.38 PCRE2 also forbids this by default. However, the +PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK option can be used when calling +pcre2_compile() to re-enable the previous behaviour. When this option is +set, \K is acted upon when it occurs inside positive assertions, but is +ignored in negative assertions. Note that when a pattern such as (?=ab\K) +matches, the reported start of the match can be greater than the end of the +match. Using \K in a lookbehind assertion at the start of a pattern can also +lead to odd effects. For example, consider this pattern: +
+ (?<=\Kfoo)bar ++If the subject is "foobar", a call to pcre2_match() with a starting +offset of 3 succeeds and reports the matching string as "foobar", that is, the +start of the reported match is earlier than where the match started. + +
+The final use of backslash is for certain simple assertions. An assertion +specifies a condition that has to be met at a particular point in a match, +without consuming any characters from the subject string. The use of +groups for more complicated assertions is described +below. +The backslashed assertions are: +
+ \b matches at a word boundary + \B matches when not at a word boundary + \A matches at the start of the subject + \Z matches at the end of the subject + also matches before a newline at the end of the subject + \z matches only at the end of the subject + \G matches at the first matching position in the subject ++Inside a character class, \b has a different meaning; it matches the backspace +character. If any other of these assertions appears in a character class, an +"invalid escape sequence" error is generated. + +
+A word boundary is a position in the subject string where the current character +and the previous character do not both match \w or \W (i.e. one matches +\w and the other matches \W), or the start or end of the string if the +first or last character matches \w, respectively. When PCRE2 is built with +Unicode support, the meanings of \w and \W can be changed by setting the +PCRE2_UCP option. When this is done, it also affects \b and \B. Neither PCRE2 +nor Perl has a separate "start of word" or "end of word" metasequence. However, +whatever follows \b normally determines which it is. For example, the fragment +\ba matches "a" at the start of a word. +
++The \A, \Z, and \z assertions differ from the traditional circumflex and +dollar (described in the next section) in that they only ever match at the very +start and end of the subject string, whatever options are set. Thus, they are +independent of multiline mode. These three assertions are not affected by the +PCRE2_NOTBOL or PCRE2_NOTEOL options, which affect only the behaviour of the +circumflex and dollar metacharacters. However, if the startoffset +argument of pcre2_match() is non-zero, indicating that matching is to +start at a point other than the beginning of the subject, \A can never match. +The difference between \Z and \z is that \Z matches before a newline at the +end of the string as well as at the very end, whereas \z matches only at the +end. +
++The \G assertion is true only when the current matching position is at the +start point of the matching process, as specified by the startoffset +argument of pcre2_match(). It differs from \A when the value of +startoffset is non-zero. By calling pcre2_match() multiple times +with appropriate arguments, you can mimic Perl's /g option, and it is in this +kind of implementation where \G can be useful. +
++Note, however, that PCRE2's implementation of \G, being true at the starting +character of the matching process, is subtly different from Perl's, which +defines it as true at the end of the previous match. In Perl, these can be +different when the previously matched string was empty. Because PCRE2 does just +one match at a time, it cannot reproduce this behaviour. +
++If all the alternatives of a pattern begin with \G, the expression is anchored +to the starting match position, and the "anchored" flag is set in the compiled +regular expression. +
++The circumflex and dollar metacharacters are zero-width assertions. That is, +they test for a particular condition being true without consuming any +characters from the subject string. These two metacharacters are concerned with +matching the starts and ends of lines. If the newline convention is set so that +only the two-character sequence CRLF is recognized as a newline, isolated CR +and LF characters are treated as ordinary data characters, and are not +recognized as newlines. +
++Outside a character class, in the default matching mode, the circumflex +character is an assertion that is true only if the current matching point is at +the start of the subject string. If the startoffset argument of +pcre2_match() is non-zero, or if PCRE2_NOTBOL is set, circumflex can +never match if the PCRE2_MULTILINE option is unset. Inside a character class, +circumflex has an entirely different meaning +(see below). +
++Circumflex need not be the first character of the pattern if a number of +alternatives are involved, but it should be the first thing in each alternative +in which it appears if the pattern is ever to match that branch. If all +possible alternatives start with a circumflex, that is, if the pattern is +constrained to match only at the start of the subject, it is said to be an +"anchored" pattern. (There are also other constructs that can cause a pattern +to be anchored.) +
++The dollar character is an assertion that is true only if the current matching +point is at the end of the subject string, or immediately before a newline at +the end of the string (by default), unless PCRE2_NOTEOL is set. Note, however, +that it does not actually match the newline. Dollar need not be the last +character of the pattern if a number of alternatives are involved, but it +should be the last item in any branch in which it appears. Dollar has no +special meaning in a character class. +
++The meaning of dollar can be changed so that it matches only at the very end of +the string, by setting the PCRE2_DOLLAR_ENDONLY option at compile time. This +does not affect the \Z assertion. +
++The meanings of the circumflex and dollar metacharacters are changed if the +PCRE2_MULTILINE option is set. When this is the case, a dollar character +matches before any newlines in the string, as well as at the very end, and a +circumflex matches immediately after internal newlines as well as at the start +of the subject string. It does not match after a newline that ends the string, +for compatibility with Perl. However, this can be changed by setting the +PCRE2_ALT_CIRCUMFLEX option. +
++For example, the pattern /^abc$/ matches the subject string "def\nabc" (where +\n represents a newline) in multiline mode, but not otherwise. Consequently, +patterns that are anchored in single line mode because all branches start with +^ are not anchored in multiline mode, and a match for circumflex is possible +when the startoffset argument of pcre2_match() is non-zero. The +PCRE2_DOLLAR_ENDONLY option is ignored if PCRE2_MULTILINE is set. +
++When the newline convention (see +"Newline conventions" +below) recognizes the two-character sequence CRLF as a newline, this is +preferred, even if the single characters CR and LF are also recognized as +newlines. For example, if the newline convention is "any", a multiline mode +circumflex matches before "xyz" in the string "abc\r\nxyz" rather than after +CR, even though CR on its own is a valid newline. (It also matches at the very +start of the string, of course.) +
++Note that the sequences \A, \Z, and \z can be used to match the start and +end of the subject in both modes, and if all branches of a pattern start with +\A it is always anchored, whether or not PCRE2_MULTILINE is set. +
++Outside a character class, a dot in the pattern matches any one character in +the subject string except (by default) a character that signifies the end of a +line. One or more characters may be specified as line terminators (see +"Newline conventions" +above). +
++Dot never matches a single line-ending character. When the two-character +sequence CRLF is the only line ending, dot does not match CR if it is +immediately followed by LF, but otherwise it matches all characters (including +isolated CRs and LFs). When ANYCRLF is selected for line endings, no occurrences +of CR or LF match dot. When all Unicode line endings are being recognized, dot +does not match CR or LF or any of the other line ending characters. +
++The behaviour of dot with regard to newlines can be changed. If the +PCRE2_DOTALL option is set, a dot matches any one character, without exception. +If the two-character sequence CRLF is present in the subject string, it takes +two dots to match it. +
++The handling of dot is entirely independent of the handling of circumflex and +dollar, the only relationship being that they both involve newlines. Dot has no +special meaning in a character class. +
++The escape sequence \N when not followed by an opening brace behaves like a +dot, except that it is not affected by the PCRE2_DOTALL option. In other words, +it matches any character except one that signifies the end of a line. +
++When \N is followed by an opening brace it has a different meaning. See the +section entitled +"Non-printing characters" +above for details. Perl also uses \N{name} to specify characters by Unicode +name; PCRE2 does not support this. +
++Outside a character class, the escape sequence \C matches any one code unit, +whether or not a UTF mode is set. In the 8-bit library, one code unit is one +byte; in the 16-bit library it is a 16-bit unit; in the 32-bit library it is a +32-bit unit. Unlike a dot, \C always matches line-ending characters. The +feature is provided in Perl in order to match individual bytes in UTF-8 mode, +but it is unclear how it can usefully be used. +
++Because \C breaks up characters into individual code units, matching one unit +with \C in UTF-8 or UTF-16 mode means that the rest of the string may start +with a malformed UTF character. This has undefined results, because PCRE2 +assumes that it is matching character by character in a valid UTF string (by +default it checks the subject string's validity at the start of processing +unless the PCRE2_NO_UTF_CHECK or PCRE2_MATCH_INVALID_UTF option is used). +
++An application can lock out the use of \C by setting the +PCRE2_NEVER_BACKSLASH_C option when compiling a pattern. It is also possible to +build PCRE2 with the use of \C permanently disabled. +
++PCRE2 does not allow \C to appear in lookbehind assertions +(described below) +in UTF-8 or UTF-16 modes, because this would make it impossible to calculate +the length of the lookbehind. Neither the alternative matching function +pcre2_dfa_match() nor the JIT optimizer supports \C in these UTF modes. +The former gives a match-time error; the latter fails to optimize and so the +match is always run using the interpreter. +
++In the 32-bit library, however, \C is always supported (when not explicitly +locked out) because it always matches a single code unit, whether or not UTF-32 +is specified. +
++In general, the \C escape sequence is best avoided. However, one way of using +it that avoids the problem of malformed UTF-8 or UTF-16 characters is to use a +lookahead to check the length of the next character, as in this pattern, which +could be used with a UTF-8 string (ignore white space and line breaks): +
+ (?| (?=[\x00-\x7f])(\C) |
+ (?=[\x80-\x{7ff}])(\C)(\C) |
+ (?=[\x{800}-\x{ffff}])(\C)(\C)(\C) |
+ (?=[\x{10000}-\x{1fffff}])(\C)(\C)(\C)(\C))
+
+In this example, a group that starts with (?| resets the capturing parentheses
+numbers in each alternative (see
+"Duplicate Group Numbers"
+below). The assertions at the start of each branch check the next UTF-8
+character for values whose encoding uses 1, 2, 3, or 4 bytes, respectively. The
+character's individual bytes are then captured by the appropriate number of
+\C groups.
+
++An opening square bracket introduces a character class, terminated by a closing +square bracket. A closing square bracket on its own is not special by default. +If a closing square bracket is required as a member of the class, it should be +the first data character in the class (after an initial circumflex, if present) +or escaped with a backslash. This means that, by default, an empty class cannot +be defined. However, if the PCRE2_ALLOW_EMPTY_CLASS option is set, a closing +square bracket at the start does end the (empty) class. +
++A character class matches a single character in the subject. A matched +character must be in the set of characters defined by the class, unless the +first character in the class definition is a circumflex, in which case the +subject character must not be in the set defined by the class. If a circumflex +is actually required as a member of the class, ensure it is not the first +character, or escape it with a backslash. +
++For example, the character class [aeiou] matches any lower case vowel, while +[^aeiou] matches any character that is not a lower case vowel. Note that a +circumflex is just a convenient notation for specifying the characters that +are in the class by enumerating those that are not. A class that starts with a +circumflex is not an assertion; it still consumes a character from the subject +string, and therefore it fails if the current pointer is at the end of the +string. +
++Characters in a class may be specified by their code points using \o, \x, or +\N{U+hh..} in the usual way. When caseless matching is set, any letters in a +class represent both their upper case and lower case versions, so for example, +a caseless [aeiou] matches "A" as well as "a", and a caseless [^aeiou] does not +match "A", whereas a caseful version would. Note that there are two ASCII +characters, K and S, that, in addition to their lower case ASCII equivalents, +are case-equivalent with Unicode U+212A (Kelvin sign) and U+017F (long S) +respectively when either PCRE2_UTF or PCRE2_UCP is set. +
++Characters that might indicate line breaks are never treated in any special way +when matching character classes, whatever line-ending sequence is in use, and +whatever setting of the PCRE2_DOTALL and PCRE2_MULTILINE options is used. A +class such as [^a] always matches one of these characters. +
++The generic character type escape sequences \d, \D, \h, \H, \p, \P, \s, +\S, \v, \V, \w, and \W may appear in a character class, and add the +characters that they match to the class. For example, [\dABCDEF] matches any +hexadecimal digit. In UTF modes, the PCRE2_UCP option affects the meanings of +\d, \s, \w and their upper case partners, just as it does when they appear +outside a character class, as described in the section entitled +"Generic character types" +above. The escape sequence \b has a different meaning inside a character +class; it matches the backspace character. The sequences \B, \R, and \X are +not special inside a character class. Like any other unrecognized escape +sequences, they cause an error. The same is true for \N when not followed by +an opening brace. +
++The minus (hyphen) character can be used to specify a range of characters in a +character class. For example, [d-m] matches any letter between d and m, +inclusive. If a minus character is required in a class, it must be escaped with +a backslash or appear in a position where it cannot be interpreted as +indicating a range, typically as the first or last character in the class, +or immediately after a range. For example, [b-d-z] matches letters in the range +b to d, a hyphen character, or z. +
++Perl treats a hyphen as a literal if it appears before or after a POSIX class +(see below) or before or after a character type escape such as \d or \H. +However, unless the hyphen is the last character in the class, Perl outputs a +warning in its warning mode, as this is most likely a user error. As PCRE2 has +no facility for warning, an error is given in these cases. +
++It is not possible to have the literal character "]" as the end character of a +range. A pattern such as [W-]46] is interpreted as a class of two characters +("W" and "-") followed by a literal string "46]", so it would match "W46]" or +"-46]". However, if the "]" is escaped with a backslash it is interpreted as +the end of range, so [W-\]46] is interpreted as a class containing a range +followed by two other characters. The octal or hexadecimal representation of +"]" can also be used to end a range. +
++Ranges normally include all code points between the start and end characters, +inclusive. They can also be used for code points specified numerically, for +example [\000-\037]. Ranges can include any characters that are valid for the +current mode. In any UTF mode, the so-called "surrogate" characters (those +whose code points lie between 0xd800 and 0xdfff inclusive) may not be specified +explicitly by default (the PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES option disables +this check). However, ranges such as [\x{d7ff}-\x{e000}], which include the +surrogates, are always permitted. +
++There is a special case in EBCDIC environments for ranges whose end points are +both specified as literal letters in the same case. For compatibility with +Perl, EBCDIC code points within the range that are not letters are omitted. For +example, [h-k] matches only four characters, even though the codes for h and k +are 0x88 and 0x92, a range of 11 code points. However, if the range is +specified numerically, for example, [\x88-\x92] or [h-\x92], all code points +are included. +
++If a range that includes letters is used when caseless matching is set, it +matches the letters in either case. For example, [W-c] is equivalent to +[][\\^_`wxyzabc], matched caselessly, and in a non-UTF mode, if character +tables for a French locale are in use, [\xc8-\xcb] matches accented E +characters in both cases. +
++A circumflex can conveniently be used with the upper case character types to +specify a more restricted set of characters than the matching lower case type. +For example, the class [^\W_] matches any letter or digit, but not underscore, +whereas [\w] includes underscore. A positive character class should be read as +"something OR something OR ..." and a negative class as "NOT something AND NOT +something AND NOT ...". +
++The only metacharacters that are recognized in character classes are backslash, +hyphen (only where it can be interpreted as specifying a range), circumflex +(only at the start), opening square bracket (only when it can be interpreted as +introducing a POSIX class name, or for a special compatibility feature - see +the next two sections), and the terminating closing square bracket. However, +escaping other non-alphanumeric characters does no harm. +
++Perl supports the POSIX notation for character classes. This uses names +enclosed by [: and :] within the enclosing square brackets. PCRE2 also supports +this notation. For example, +
+ [01[:alpha:]%] ++matches "0", "1", any alphabetic character, or "%". The supported class names +are: +
+ alnum letters and digits + alpha letters + ascii character codes 0 - 127 + blank space or tab only + cntrl control characters + digit decimal digits (same as \d) + graph printing characters, excluding space + lower lower case letters + print printing characters, including space + punct printing characters, excluding letters and digits and space + space white space (the same as \s from PCRE 8.34) + upper upper case letters + word "word" characters (same as \w) + xdigit hexadecimal digits ++The default "space" characters are HT (9), LF (10), VT (11), FF (12), CR (13), +and space (32). If locale-specific matching is taking place, the list of space +characters may be different; there may be fewer or more of them. "Space" and +\s match the same set of characters, as do "word" and \w. + +
+The name "word" is a Perl extension, and "blank" is a GNU extension from Perl +5.8. Another Perl extension is negation, which is indicated by a ^ character +after the colon. For example, +
+ [12[:^digit:]] ++matches "1", "2", or any non-digit. PCRE2 (and Perl) also recognize the POSIX +syntax [.ch.] and [=ch=] where "ch" is a "collating element", but these are not +supported, and an error is given if they are encountered. + +
+By default, characters with values greater than 127 do not match any of the +POSIX character classes, although this may be different for characters in the +range 128-255 when locale-specific matching is happening. However, in UCP mode, +unless certain options are set (see below), some of the classes are changed so +that Unicode character properties are used. This is achieved by replacing +POSIX classes with other sequences, as follows: +
+ [:alnum:] becomes \p{Xan}
+ [:alpha:] becomes \p{L}
+ [:blank:] becomes \h
+ [:cntrl:] becomes \p{Cc}
+ [:digit:] becomes \p{Nd}
+ [:lower:] becomes \p{Ll}
+ [:space:] becomes \p{Xps}
+ [:upper:] becomes \p{Lu}
+ [:word:] becomes \p{Xwd}
+
+Negated versions, such as [:^alpha:] use \P instead of \p. Four other POSIX
+classes are handled specially in UCP mode:
+
++[:graph:] +This matches characters that have glyphs that mark the page when printed. In +Unicode property terms, it matches all characters with the L, M, N, P, S, or Cf +properties, except for: +
+ U+061C Arabic Letter Mark + U+180E Mongolian Vowel Separator + U+2066 - U+2069 Various "isolate"s + ++ +
+[:print:] +This matches the same characters as [:graph:] plus space characters that are +not controls, that is, characters with the Zs property. +
++[:punct:] +This matches all characters that have the Unicode P (punctuation) property, +plus those characters with code points less than 256 that have the S (Symbol) +property. +
++[:xdigit:] +In addition to the ASCII hexadecimal digits, this also matches the "fullwidth" +versions of those characters, whose Unicode code points start at U+FF10. This +is a change that was made in PCRE2 release 10.43 for Perl compatibility. +
++The other POSIX classes are unchanged by PCRE2_UCP, and match only characters +with code points less than 256. +
++There are two options that can be used to restrict the POSIX classes to ASCII +characters when PCRE2_UCP is set. The option PCRE2_EXTRA_ASCII_DIGIT affects +just [:digit:] and [:xdigit:]. Within a pattern, this can be set and unset by +(?aT) and (?-aT). The PCRE2_EXTRA_ASCII_POSIX option disables UCP processing +for all POSIX classes, including [:digit:] and [:xdigit:]. Within a pattern, +(?aP) and (?-aP) set and unset both these options for consistency. +
++In the POSIX.2 compliant library that was included in 4.4BSD Unix, the ugly +syntax [[:<:]] and [[:>:]] is used for matching "start of word" and "end of +word". PCRE2 treats these items as follows: +
+ [[:<:]] is converted to \b(?=\w) + [[:>:]] is converted to \b(?<=\w) ++Only these exact character sequences are recognized. A sequence such as +[a[:<:]b] provokes an error for an unrecognized POSIX class name. This support is +not compatible with Perl. It is provided to help migrations from other +environments, and is best not used in any new patterns. Note that \b matches +at the start and the end of a word (see +"Simple assertions" +above), and in a Perl-style pattern the preceding or following character +normally shows which is wanted, without the need for the assertions that are +used above in order to give exactly the POSIX behaviour. Note also that the +PCRE2_UCP option changes the meaning of \w (and therefore \b) by default, so +it also affects these POSIX sequences. + +
+Vertical bar characters are used to separate alternative patterns. For example, +the pattern +
+ gilbert|sullivan ++matches either "gilbert" or "sullivan". Any number of alternatives may appear, +and an empty alternative is permitted (matching the empty string). The matching +process tries each alternative in turn, from left to right, and the first one +that succeeds is used. If the alternatives are within a group +(defined below), +"succeeds" means matching the rest of the main pattern as well as the +alternative in the group. + +
+The settings of several options can be changed within a pattern by a sequence +of letters enclosed between "(?" and ")". The following are Perl-compatible, +and are described in detail in the +pcre2api +documentation. The option letters are: +
+ i for PCRE2_CASELESS + m for PCRE2_MULTILINE + n for PCRE2_NO_AUTO_CAPTURE + s for PCRE2_DOTALL + x for PCRE2_EXTENDED + xx for PCRE2_EXTENDED_MORE ++For example, (?im) sets caseless, multiline matching. It is also possible to +unset these options by preceding the relevant letters with a hyphen, for +example (?-im). The two "extended" options are not independent; unsetting +either one cancels the effects of both of them. + +
+A combined setting and unsetting such as (?im-sx), which sets PCRE2_CASELESS +and PCRE2_MULTILINE while unsetting PCRE2_DOTALL and PCRE2_EXTENDED, is also +permitted. Only one hyphen may appear in the options string. If a letter +appears both before and after the hyphen, the option is unset. An empty options +setting "(?)" is allowed. Needless to say, it has no effect. +
++If the first character following (? is a circumflex, it causes all of the above +options to be unset. Letters may follow the circumflex to cause some options to +be re-instated, but a hyphen may not appear. +
++Some PCRE2-specific options can be changed by the same mechanism using these +pairs or individual letters: +
+ aD for PCRE2_EXTRA_ASCII_BSD + aS for PCRE2_EXTRA_ASCII_BSS + aW for PCRE2_EXTRA_ASCII_BSW + aP for PCRE2_EXTRA_ASCII_POSIX and PCRE2_EXTRA_ASCII_DIGIT + aT for PCRE2_EXTRA_ASCII_DIGIT + r for PCRE2_EXTRA_CASELESS_RESTRICT + J for PCRE2_DUPNAMES + U for PCRE2_UNGREEDY ++However, except for 'r', these are not unset by (?^), which is equivalent to +(?-imnrsx). If 'a' is not followed by any of the upper case letters shown +above, it sets (or unsets) all the ASCII options. + +
+PCRE2_EXTRA_ASCII_DIGIT has no additional effect when PCRE2_EXTRA_ASCII_POSIX +is set, but including it in (?aP) means that (?-aP) suppresses all ASCII +restrictions for POSIX classes. +
++When one of these option changes occurs at top level (that is, not inside group +parentheses), the change applies until a subsequent change, or the end of the +pattern. An option change within a group (see below for a description of +groups) affects only that part of the group that follows it. At the end of the +group these options are reset to the state they were before the group. For +example, +
+ (a(?i)b)c ++matches abc and aBc and no other strings (assuming PCRE2_CASELESS is not set +externally). Any changes made in one alternative do carry on into subsequent +branches within the same group. For example, +
+ (a(?i)b|c) ++matches "ab", "aB", "c", and "C", even though when matching "C" the first +branch is abandoned before the option setting. This is because the effects of +option settings happen at compile time. There would be some very weird +behaviour otherwise. + +
+As a convenient shorthand, if any option settings are required at the start of +a non-capturing group (see the next section), the option letters may +appear between the "?" and the ":". Thus the two patterns +
+ (?i:saturday|sunday) + (?:(?i)saturday|sunday) ++match exactly the same set of strings. + +
+Note: There are other PCRE2-specific options, applying to the whole +pattern, which can be set by the application when the compiling function is +called. In addition, the pattern can contain special leading sequences such as +(*CRLF) to override what the application has set or what has been defaulted. +Details are given in the section entitled +"Newline sequences" +above. There are also the (*UTF) and (*UCP) leading sequences that can be used +to set UTF and Unicode property modes; they are equivalent to setting the +PCRE2_UTF and PCRE2_UCP options, respectively. However, the application can set +the PCRE2_NEVER_UTF or PCRE2_NEVER_UCP options, which lock out the use of the +(*UTF) and (*UCP) sequences. +
+
+Groups are delimited by parentheses (round brackets), which can be nested.
+Turning part of a pattern into a group does two things:
+
+
+1. It localizes a set of alternatives. For example, the pattern
+
+ cat(aract|erpillar|) ++matches "cataract", "caterpillar", or "cat". Without the parentheses, it would +match "cataract", "erpillar" or an empty string. +
+Opening parentheses are counted from left to right (starting from 1) to obtain +numbers for capture groups. For example, if the string "the red king" is +matched against the pattern +
+ the ((red|white) (king|queen)) ++the captured substrings are "red king", "red", and "king", and are numbered 1, +2, and 3, respectively. + +
+The fact that plain parentheses fulfil two functions is not always helpful. +There are often times when grouping is required without capturing. If an +opening parenthesis is followed by a question mark and a colon, the group +does not do any capturing, and is not counted when computing the number of any +subsequent capture groups. For example, if the string "the white queen" +is matched against the pattern +
+ the ((?:red|white) (king|queen)) ++the captured substrings are "white queen" and "queen", and are numbered 1 and +2. The maximum number of capture groups is 65535. + +
+As a convenient shorthand, if any option settings are required at the start of +a non-capturing group, the option letters may appear between the "?" and the +":". Thus the two patterns +
+ (?i:saturday|sunday) + (?:(?i)saturday|sunday) ++match exactly the same set of strings. Because alternative branches are tried +from left to right, and options are not reset until the end of the group is +reached, an option setting in one branch does affect subsequent branches, so +the above patterns match "SUNDAY" as well as "Saturday". + +
+Perl 5.10 introduced a feature whereby each alternative in a group uses the +same numbers for its capturing parentheses. Such a group starts with (?| and is +itself a non-capturing group. For example, consider this pattern: +
+ (?|(Sat)ur|(Sun))day ++Because the two alternatives are inside a (?| group, both sets of capturing +parentheses are numbered one. Thus, when the pattern matches, you can look +at captured substring number one, whichever alternative matched. This construct +is useful when you want to capture part, but not all, of one of a number of +alternatives. Inside a (?| group, parentheses are numbered as usual, but the +number is reset at the start of each branch. The numbers of any capturing +parentheses that follow the whole group start after the highest number used in +any branch. The following example is taken from the Perl documentation. The +numbers underneath show in which buffer the captured content will be stored. +
+ # before ---------------branch-reset----------- after + / ( a ) (?| x ( y ) z | (p (q) r) | (t) u (v) ) ( z ) /x + # 1 2 2 3 2 3 4 ++A backreference to a capture group uses the most recent value that is set for +the group. The following pattern matches "abcabc" or "defdef": +
+ /(?|(abc)|(def))\1/ ++In contrast, a subroutine call to a capture group always refers to the +first one in the pattern with the given number. The following pattern matches +"abcabc" or "defabc": +
+ /(?|(abc)|(def))(?1)/ ++A relative reference such as (?-1) is no different: it is just a convenient way +of computing an absolute group number. + +
+If a +condition test +for a group's having matched refers to a non-unique number, the test is +true if any group with that number has matched. +
++An alternative approach to using this "branch reset" feature is to use +duplicate named groups, as described in the next section. +
++Identifying capture groups by number is simple, but it can be very hard to keep +track of the numbers in complicated patterns. Furthermore, if an expression is +modified, the numbers may change. To help with this difficulty, PCRE2 supports +the naming of capture groups. This feature was not added to Perl until release +5.10. Python had the feature earlier, and PCRE1 introduced it at release 4.0, +using the Python syntax. PCRE2 supports both the Perl and the Python syntax. +
++In PCRE2, a capture group can be named in one of three ways: (?<name>...) or +(?'name'...) as in Perl, or (?P<name>...) as in Python. Names may be up to 128 +code units long. When PCRE2_UTF is not set, they may contain only ASCII +alphanumeric characters and underscores, but must start with a non-digit. When +PCRE2_UTF is set, the syntax of group names is extended to allow any Unicode +letter or Unicode decimal digit. In other words, group names must match one of +these patterns: +
+ ^[_A-Za-z][_A-Za-z0-9]*\z when PCRE2_UTF is not set
+ ^[_\p{L}][_\p{L}\p{Nd}]*\z when PCRE2_UTF is set
+
+References to capture groups from other parts of the pattern, such as
+backreferences,
+recursion,
+and
+conditions,
+can all be made by name as well as by number.
+
++Named capture groups are allocated numbers as well as names, exactly as +if the names were not present. In both PCRE2 and Perl, capture groups +are primarily identified by numbers; any names are just aliases for these +numbers. The PCRE2 API provides function calls for extracting the complete +name-to-number translation table from a compiled pattern, as well as +convenience functions for extracting captured substrings by name. +
++Warning: When more than one capture group has the same number, as +described in the previous section, a name given to one of them applies to all +of them. Perl allows identically numbered groups to have different names. +Consider this pattern, where there are two capture groups, both numbered 1: +
+ (?|(?<AA>aa)|(?<BB>bb)) ++Perl allows this, with both names AA and BB as aliases of group 1. Thus, after +a successful match, both names yield the same value (either "aa" or "bb"). + +
+In an attempt to reduce confusion, PCRE2 does not allow the same group number +to be associated with more than one name. The example above provokes a +compile-time error. However, there is still scope for confusion. Consider this +pattern: +
+ (?|(?<AA>aa)|(bb)) ++Although the second group number 1 is not explicitly named, the name AA is +still an alias for any group 1. Whether the pattern matches "aa" or "bb", a +reference by name to group AA yields the matched string. + +
+By default, a name must be unique within a pattern, except that duplicate names +are permitted for groups with the same number, for example: +
+ (?|(?<AA>aa)|(?<AA>bb)) ++The duplicate name constraint can be disabled by setting the PCRE2_DUPNAMES +option at compile time, or by the use of (?J) within the pattern, as described +in the section entitled +"Internal Option Setting" +above. + +
+Duplicate names can be useful for patterns where only one instance of the named +capture group can match. Suppose you want to match the name of a weekday, +either as a 3-letter abbreviation or as the full name, and in both cases you +want to extract the abbreviation. This pattern (ignoring the line breaks) does +the job: +
+ (?J) + (?<DN>Mon|Fri|Sun)(?:day)?| + (?<DN>Tue)(?:sday)?| + (?<DN>Wed)(?:nesday)?| + (?<DN>Thu)(?:rsday)?| + (?<DN>Sat)(?:urday)? ++There are five capture groups, but only one is ever set after a match. The +convenience functions for extracting the data by name return the substring for +the first (and in this example, the only) group of that name that matched. This +saves searching to find which numbered group it was. (An alternative way of +solving this problem is to use a "branch reset" group, as described in the +previous section.) + +
+If you make a backreference to a non-unique named group from elsewhere in the +pattern, the groups to which the name refers are checked in the order in which +they appear in the overall pattern. The first one that is set is used for the +reference. For example, this pattern matches both "foofoo" and "barbar" but not +"foobar" or "barfoo": +
+ (?J)(?:(?<n>foo)|(?<n>bar))\k<n> + ++ +
+If you make a subroutine call to a non-unique named group, the one that +corresponds to the first occurrence of the name is used. In the absence of +duplicate numbers this is the one with the lowest number. +
++If you use a named reference in a condition +test (see the +section about conditions +below), either to check whether a capture group has matched, or to check for +recursion, all groups with the same name are tested. If the condition is true +for any one of them, the overall condition is true. This is the same behaviour +as testing by number. For further details of the interfaces for handling named +capture groups, see the +pcre2api +documentation. +
++Repetition is specified by quantifiers, which may follow any one of these +items: +
+ a literal data character + the dot metacharacter + the \C escape sequence + the \R escape sequence + the \X escape sequence + any escape sequence that matches a single character + a character class + a backreference + a parenthesized group (including lookaround assertions) + a subroutine call (recursive or otherwise) ++If a quantifier does not follow a repeatable item, an error occurs. The +general repetition quantifier specifies a minimum and maximum number of +permitted matches by giving two numbers in curly brackets (braces), separated +by a comma. The numbers must be less than 65536, and the first must be less +than or equal to the second. For example, +
+ z{2,4}
+
+matches "zz", "zzz", or "zzzz". A closing brace on its own is not a special
+character. If the second number is omitted, but the comma is present, there is
+no upper limit; if the second number and the comma are both omitted, the
+quantifier specifies an exact number of required matches. Thus
+
+ [aeiou]{3,}
+
+matches at least 3 successive vowels, but may match many more, whereas
+
+ \d{8}
+
+matches exactly 8 digits. If the first number is omitted, the lower limit is
+taken as zero; in this case the upper limit must be present.
+
+ X{,4} is interpreted as X{0,4}
+
+This is a change in behaviour that happened in Perl 5.34.0 and PCRE2 10.43. In
+earlier versions such a sequence was not interpreted as a quantifier. Other
+regular expression engines may behave either way.
+
++If the characters that follow an opening brace do not match the syntax of a +quantifier, the brace is taken as a literal character. In particular, this +means that {,} is a literal string of three characters. +
++Note that not every opening brace is potentially the start of a quantifier +because braces are used in other items such as \N{U+345} or \k{name}. +
++In UTF modes, quantifiers apply to characters rather than to individual code +units. Thus, for example, \x{100}{2} matches two characters, each of +which is represented by a two-byte sequence in a UTF-8 string. Similarly, +\X{3} matches three Unicode extended grapheme clusters, each of which may be +several code units long (and they may be of different lengths). +
++The quantifier {0} is permitted, causing the expression to behave as if the +previous item and the quantifier were not present. This may be useful for +capture groups that are referenced as +subroutines +from elsewhere in the pattern (but see also the section entitled +"Defining capture groups for use by reference only" +below). Except for parenthesized groups, items that have a {0} quantifier are +omitted from the compiled pattern. +
++For convenience, the three most common quantifiers have single-character +abbreviations: +
+ * is equivalent to {0,}
+ + is equivalent to {1,}
+ ? is equivalent to {0,1}
+
+It is possible to construct infinite loops by following a group that can match
+no characters with a quantifier that has no upper limit, for example:
++ (a?)* ++Earlier versions of Perl and PCRE1 used to give an error at compile time for +such patterns. However, because there are cases where this can be useful, such +patterns are now accepted, but whenever an iteration of such a group matches no +characters, matching moves on to the next item in the pattern instead of +repeatedly matching an empty string. This does not prevent backtracking into +any of the iterations if a subsequent item fails to match. + +
+By default, quantifiers are "greedy", that is, they match as much as possible +(up to the maximum number of permitted repetitions), without causing the rest +of the pattern to fail. The classic example of where this gives problems is in +trying to match comments in C programs. These appear between /* and */ and +within the comment, individual * and / characters may appear. An attempt to +match C comments by applying the pattern +
+ /\*.*\*/ ++to the string +
+ /* first comment */ not comment /* second comment */ ++fails, because it matches the entire string owing to the greediness of the .* +item. However, if a quantifier is followed by a question mark, it ceases to be +greedy, and instead matches the minimum number of times possible, so the +pattern +
+ /\*.*?\*/ ++does the right thing with C comments. The meaning of the various quantifiers is +not otherwise changed, just the preferred number of matches. Do not confuse +this use of question mark with its use as a quantifier in its own right. +Because it has two uses, it can sometimes appear doubled, as in +
+ \d??\d ++which matches one digit by preference, but can match two if that is the only +way the rest of the pattern matches. + +
+If the PCRE2_UNGREEDY option is set (an option that is not available in Perl), +the quantifiers are not greedy by default, but individual ones can be made +greedy by following them with a question mark. In other words, it inverts the +default behaviour. +
++When a parenthesized group is quantified with a minimum repeat count that +is greater than 1 or with a limited maximum, more memory is required for the +compiled pattern, in proportion to the size of the minimum or maximum. +
++If a pattern starts with .* or .{0,} and the PCRE2_DOTALL option (equivalent +to Perl's /s) is set, thus allowing the dot to match newlines, the pattern is +implicitly anchored, because whatever follows will be tried against every +character position in the subject string, so there is no point in retrying the +overall match at any position after the first. PCRE2 normally treats such a +pattern as though it were preceded by \A. +
++In cases where it is known that the subject string contains no newlines, it is +worth setting PCRE2_DOTALL in order to obtain this optimization, or +alternatively, using ^ to indicate anchoring explicitly. +
++However, there are some cases where the optimization cannot be used. When .* +is inside capturing parentheses that are the subject of a backreference +elsewhere in the pattern, a match at the start may fail where a later one +succeeds. Consider, for example: +
+ (.*)abc\1 ++If the subject is "xyz123abc123" the match point is the fourth character. For +this reason, such a pattern is not implicitly anchored. + +
+Another case where implicit anchoring is not applied is when the leading .* is +inside an atomic group. Once again, a match at the start may fail where a later +one succeeds. Consider this pattern: +
+ (?>.*?a)b ++It matches "ab" in the subject "aab". The use of the backtracking control verbs +(*PRUNE) and (*SKIP) also disables this optimization, and there is an option, +PCRE2_NO_DOTSTAR_ANCHOR, to do so explicitly. + +
+When a capture group is repeated, the value captured is the substring that +matched the final iteration. For example, after +
+ (tweedle[dume]{3}\s*)+
+
+has matched "tweedledum tweedledee" the value of the captured substring is
+"tweedledee". However, if there are nested capture groups, the corresponding
+captured values may have been set in previous iterations. For example, after
++ (a|(b))+ ++matches "aba" the value of the second captured substring is "b". + +
+With both maximizing ("greedy") and minimizing ("ungreedy" or "lazy") +repetition, failure of what follows normally causes the repeated item to be +re-evaluated to see if a different number of repeats allows the rest of the +pattern to match. Sometimes it is useful to prevent this, either to change the +nature of the match, or to cause it to fail earlier than it otherwise might, when +the author of the pattern knows there is no point in carrying on. +
++Consider, for example, the pattern \d+foo when applied to the subject line +
+ 123456bar ++After matching all 6 digits and then failing to match "foo", the normal +action of the matcher is to try again with only 5 digits matching the \d+ +item, and then with 4, and so on, before ultimately failing. "Atomic grouping" +(a term taken from Jeffrey Friedl's book) provides the means for specifying +that once a group has matched, it is not to be re-evaluated in this way. + +
+If we use atomic grouping for the previous example, the matcher gives up +immediately on failing to match "foo" the first time. The notation is a kind of +special parenthesis, starting with (?> as in this example: +
+ (?>\d+)foo ++Perl 5.28 introduced an experimental alphabetic form starting with (* which may +be easier to remember: +
+ (*atomic:\d+)foo ++This kind of parenthesized group "locks up" the part of the pattern it contains +once it has matched, and a failure further into the pattern is prevented from +backtracking into it. Backtracking past it to previous items, however, works as +normal. + +
+An alternative description is that a group of this type matches exactly the +string of characters that an identical standalone pattern would match, if +anchored at the current point in the subject string. +
++Atomic groups are not capture groups. Simple cases such as the above example +can be thought of as a maximizing repeat that must swallow everything it can. +So, while both \d+ and \d+? are prepared to adjust the number of digits they +match in order to make the rest of the pattern match, (?>\d+) can only match +an entire sequence of digits. +
++Atomic groups in general can of course contain arbitrarily complicated +expressions, and can be nested. However, when the contents of an atomic +group is just a single repeated item, as in the example above, a simpler +notation, called a "possessive quantifier" can be used. This consists of an +additional + character following a quantifier. Using this notation, the +previous example can be rewritten as +
+ \d++foo ++Note that a possessive quantifier can be used with an entire group, for +example: +
+ (abc|xyz){2,3}+
+
+Possessive quantifiers are always greedy; the setting of the PCRE2_UNGREEDY
+option is ignored. They are a convenient notation for the simpler forms of
+atomic group. However, there is no difference in the meaning of a possessive
+quantifier and the equivalent atomic group, though there may be a performance
+difference; possessive quantifiers should be slightly faster.
+
++The possessive quantifier syntax is an extension to the Perl 5.8 syntax. +Jeffrey Friedl originated the idea (and the name) in the first edition of his +book. Mike McCloskey liked it, so implemented it when he built Sun's Java +package, and PCRE1 copied it from there. It found its way into Perl at release +5.10. +
++PCRE2 has an optimization that automatically "possessifies" certain simple +pattern constructs. For example, the sequence A+B is treated as A++B because +there is no point in backtracking into a sequence of A's when B must follow. +This feature can be disabled by the PCRE2_NO_AUTO_POSSESS option, or by starting +the pattern with (*NO_AUTO_POSSESS). +
++When a pattern contains an unlimited repeat inside a group that can itself be +repeated an unlimited number of times, the use of an atomic group is the only +way to avoid some failing matches taking a very long time indeed. The pattern +
+ (\D+|<\d+>)*[!?] ++matches an unlimited number of substrings that either consist of non-digits, or +digits enclosed in <>, followed by either ! or ?. When it matches, it runs +quickly. However, if it is applied to +
+ aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ++it takes a long time before reporting failure. This is because the string can +be divided between the internal \D+ repeat and the external * repeat in a +large number of ways, and all have to be tried. (The example uses [!?] rather +than a single character at the end, because both PCRE2 and Perl have an +optimization that allows for fast failure when a single character is used. They +remember the last single character that is required for a match, and fail early +if it is not present in the string.) If the pattern is changed so that it uses +an atomic group, like this: +
+ ((?>\D+)|<\d+>)*[!?] ++sequences of non-digits cannot be broken, and failure happens quickly. + +
+Outside a character class, a backslash followed by a digit greater than 0 (and +possibly further digits) is a backreference to a capture group earlier (that +is, to its left) in the pattern, provided there have been that many previous +capture groups. +
++However, if the decimal number following the backslash is less than 8, it is +always taken as a backreference, and causes an error only if there are not that +many capture groups in the entire pattern. In other words, the group that is +referenced need not be to the left of the reference for numbers less than 8. A +"forward backreference" of this type can make sense when a repetition is +involved and the group to the right has participated in an earlier iteration. +
++It is not possible to have a numerical "forward backreference" to a group whose +number is 8 or more using this syntax because a sequence such as \50 is +interpreted as a character defined in octal. See the subsection entitled +"Non-printing characters" +above +for further details of the handling of digits following a backslash. Other +forms of backreferencing do not suffer from this restriction. In particular, +there is no problem when named capture groups are used (see below). +
++Another way of avoiding the ambiguity inherent in the use of digits following a +backslash is to use the \g escape sequence. This escape must be followed by a +signed or unsigned number, optionally enclosed in braces. These examples are +all identical: +
+ (ring), \1
+ (ring), \g1
+ (ring), \g{1}
+
+An unsigned number specifies an absolute reference without the ambiguity that
+is present in the older syntax. It is also useful when literal digits follow
+the reference. A signed number is a relative reference. Consider this example:
+
+ (abc(def)ghi)\g{-1}
+
+The sequence \g{-1} is a reference to the capture group whose number is one
+less than the number of the next group to be started, so in this example (where
+the next group would be numbered 3) it is equivalent to \2, and \g{-2} would
+be equivalent to \1. Note that if this construct is inside a capture group,
+that group is included in the count, so in this example \g{-2} also refers to
+group 1:
+
+ (A)(\g{-2}B)
+
+The use of relative references can be helpful in long patterns, and also in
+patterns that are created by joining together fragments that contain references
+within themselves.
+
++The sequence \g{+1} is a reference to the next capture group that is started +after this item, and \g{+2} refers to the one after that, and so on. This kind +of forward reference can be useful in patterns that repeat. Perl does not +support the use of + in this way. +
++A backreference matches whatever actually most recently matched the capture +group in the current subject string, rather than anything at all that matches +the group (see +"Groups as subroutines" +below for a way of doing that). So the pattern +
+ (sens|respons)e and \1ibility ++matches "sense and sensibility" and "response and responsibility", but not +"sense and responsibility". If caseful matching is in force at the time of the +backreference, the case of letters is relevant. For example, +
+ ((?i)rah)\s+\1 ++matches "rah rah" and "RAH RAH", but not "RAH rah", even though the original +capture group is matched caselessly. + +
+There are several different ways of writing backreferences to named capture +groups. The .NET syntax is \k{name}, the Python syntax is (?P=name), and the +original Perl syntax is \k<name> or \k'name'. All of these are now supported +by both Perl and PCRE2. Perl 5.10's unified backreference syntax, in which \g +can be used for both numeric and named references, is also supported by PCRE2. +We could rewrite the above example in any of the following ways: +
+ (?<p1>(?i)rah)\s+\k<p1>
+ (?'p1'(?i)rah)\s+\k{p1}
+ (?P<p1>(?i)rah)\s+(?P=p1)
+ (?<p1>(?i)rah)\s+\g{p1}
+
+A capture group that is referenced by name may appear in the pattern before or
+after the reference.
+
++There may be more than one backreference to the same group. If a group has not +actually been used in a particular match, backreferences to it always fail by +default. For example, the pattern +
+ (a|(bc))\2 ++always fails if it starts to match "a" rather than "bc". However, if the +PCRE2_MATCH_UNSET_BACKREF option is set at compile time, a backreference to an +unset value matches an empty string. + +
+Because there may be many capture groups in a pattern, all digits following a +backslash are taken as part of a potential backreference number. If the pattern +continues with a digit character, some delimiter must be used to terminate the +backreference. If the PCRE2_EXTENDED or PCRE2_EXTENDED_MORE option is set, this +can be white space. Otherwise, the \g{} syntax or an empty comment (see +"Comments" +below) can be used. +
++A backreference that occurs inside the group to which it refers fails when the +group is first used, so, for example, (a\1) never matches. However, such +references can be useful inside repeated groups. For example, the pattern +
+ (a|b\1)+ ++matches any number of "a"s and also "aba", "ababbaa" etc. At each iteration of +the group, the backreference matches the character string corresponding to the +previous iteration. In order for this to work, the pattern must be such that +the first iteration does not need to match the backreference. This can be done +using alternation, as in the example above, or by a quantifier with a minimum +of zero. + +
+For versions of PCRE2 less than 10.25, backreferences of this type used to +cause the group that they reference to be treated as an +atomic group. +This restriction no longer applies, and backtracking into such groups can occur +as normal. +
++An assertion is a test on the characters following or preceding the current +matching point that does not consume any characters. The simple assertions +coded as \b, \B, \A, \G, \Z, \z, ^ and $ are described +above. +
++More complicated assertions are coded as parenthesized groups. There are two +kinds: those that look ahead of the current position in the subject string, and +those that look behind it, and in each case an assertion may be positive (must +match for the assertion to be true) or negative (must not match for the +assertion to be true). An assertion group is matched in the normal way, +and if it is true, matching continues after it, but with the matching position +in the subject string reset to what it was before the assertion was processed. +
++The Perl-compatible lookaround assertions are atomic. If an assertion is true, +but there is a subsequent matching failure, there is no backtracking into the +assertion. However, there are some cases where non-atomic assertions can be +useful. PCRE2 has some support for these, described in the section entitled +"Non-atomic assertions" +below, but they are not Perl-compatible. +
++A lookaround assertion may appear as the condition in a +conditional group +(see below). In this case, the result of matching the assertion determines +which branch of the condition is followed. +
++Assertion groups are not capture groups. If an assertion contains capture +groups within it, these are counted for the purposes of numbering the capture +groups in the whole pattern. Within each branch of an assertion, locally +captured substrings may be referenced in the usual way. For example, a sequence +such as (.)\g{-1} can be used to check that two adjacent characters are the +same. +
++When a branch within an assertion fails to match, any substrings that were +captured are discarded (as happens with any pattern branch that fails to +match). A negative assertion is true only when all its branches fail to match; +this means that no captured substrings are ever retained after a successful +negative assertion. When an assertion contains a matching branch, what happens +depends on the type of assertion. +
++For a positive assertion, internally captured substrings in the successful +branch are retained, and matching continues with the next pattern item after +the assertion. For a negative assertion, a matching branch means that the +assertion is not true. If such an assertion is being used as a condition in a +conditional group +(see below), captured substrings are retained, because matching continues with +the "no" branch of the condition. For other failing negative assertions, +control passes to the previous backtracking point, thus discarding any captured +strings within the assertion. +
++Most assertion groups may be repeated; though it makes no sense to assert the +same thing several times, the side effect of capturing in positive assertions +may occasionally be useful. However, an assertion that forms the condition for +a conditional group may not be quantified. PCRE2 used to restrict the +repetition of assertions, but from release 10.35 the only restriction is that +an unlimited maximum repetition is changed to be one more than the minimum. For +example, {3,} is treated as {3,4}. +
++Traditionally, symbolic sequences such as (?= and (?<= have been used to +specify lookaround assertions. Perl 5.28 introduced some experimental +alphabetic alternatives which might be easier to remember. They all start with +(* instead of (? and must be written using lower case letters. PCRE2 supports +the following synonyms: +
+ (*positive_lookahead: or (*pla: is the same as (?= + (*negative_lookahead: or (*nla: is the same as (?! + (*positive_lookbehind: or (*plb: is the same as (?<= + (*negative_lookbehind: or (*nlb: is the same as (?<! ++For example, (*pla:foo) is the same assertion as (?=foo). In the following +sections, the various assertions are described using the original symbolic +forms. + +
+Lookahead assertions start with (?= for positive assertions and (?! for +negative assertions. For example, +
+ \w+(?=;) ++matches a word followed by a semicolon, but does not include the semicolon in +the match, and +
+ foo(?!bar) ++matches any occurrence of "foo" that is not followed by "bar". Note that the +apparently similar pattern +
+ (?!foo)bar ++does not find an occurrence of "bar" that is preceded by something other than +"foo"; it finds any occurrence of "bar" whatsoever, because the assertion +(?!foo) is always true when the next three characters are "bar". A +lookbehind assertion is needed to achieve the other effect. + +
+If you want to force a matching failure at some point in a pattern, the most +convenient way to do it is with (?!) because an empty string always matches, so +an assertion that requires there not to be an empty string must always fail. +The backtracking control verb (*FAIL) or (*F) is a synonym for (?!). +
++Lookbehind assertions start with (?<= for positive assertions and (?<! for +negative assertions. For example, +
+ (?<!foo)bar ++does find an occurrence of "bar" that is not preceded by "foo". The contents of +a lookbehind assertion are restricted such that there must be a known maximum +to the lengths of all the strings it matches. There are two cases: + +
+If every top-level alternative matches a fixed length, for example +
+ (?<=colour|color) ++there is a limit of 65535 characters to the lengths, which do not have to be +the same, as this example demonstrates. This is the only kind of lookbehind +supported by PCRE2 versions earlier than 10.43 and by the alternative matching +function pcre2_dfa_match(). + +
+In PCRE2 10.43 and later, pcre2_match() supports lookbehind assertions in +which one or more top-level alternatives can match more than one string length, +for example +
+ (?<=colou?r) ++The maximum matching length for any branch of the lookbehind is limited to a +value set by the calling program (default 255 characters). Unlimited repetition +(for example \d*) is not supported. In some cases, the escape sequence \K +(see above) +can be used instead of a lookbehind assertion at the start of a pattern to get +round the length limit restriction. + +
+In UTF-8 and UTF-16 modes, PCRE2 does not allow the \C escape (which matches a +single code unit even in a UTF mode) to appear in lookbehind assertions, +because it makes it impossible to calculate the length of the lookbehind. The +\X and \R escapes, which can match different numbers of code units, are never +permitted in lookbehinds. +
++"Subroutine" +calls (see below) such as (?2) or (?&X) are permitted in lookbehinds, as long +as the called capture group matches a limited-length string. However, +recursion, +that is, a "subroutine" call into a group that is already active, +is not supported. +
++PCRE2 supports backreferences in lookbehinds, but only if certain conditions +are met. The PCRE2_MATCH_UNSET_BACKREF option must not be set, there must be no +use of (?| in the pattern (it creates duplicate group numbers), and if the +backreference is by name, the name must be unique. Of course, the referenced +group must itself match a limited length substring. The following pattern +matches words containing at least two characters that begin and end with the +same character: +
+ \b(\w)\w++(?<=\1) ++ +
+Possessive quantifiers can be used in conjunction with lookbehind assertions to +specify efficient matching at the end of subject strings. Consider a simple +pattern such as +
+ abcd$ ++when applied to a long string that does not match. Because matching proceeds +from left to right, PCRE2 will look for each "a" in the subject and then see if +what follows matches the rest of the pattern. If the pattern is specified as +
+ ^.*abcd$ ++the initial .* matches the entire string at first, but when this fails (because +there is no following "a"), it backtracks to match all but the last character, +then all but the last two characters, and so on. Once again the search for "a" +covers the entire string, from right to left, so we are no better off. However, +if the pattern is written as +
+ ^.*+(?<=abcd) ++there can be no backtracking for the .*+ item because of the possessive +quantifier; it can match only the entire string. The subsequent lookbehind +assertion does a single test on the last four characters. If it fails, the +match fails immediately. For long strings, this approach makes a significant +difference to the processing time. + +
+Several assertions (of any sort) may occur in succession. For example, +
+ (?<=\d{3})(?<!999)foo
+
+matches "foo" preceded by three digits that are not "999". Notice that each of
+the assertions is applied independently at the same point in the subject
+string. First there is a check that the previous three characters are all
+digits, and then there is a check that the same three characters are not "999".
+This pattern does not match "foo" preceded by six characters, the first three
+of which are digits and the last three of which are not "999". For example, it
+doesn't match "123abcfoo". A pattern to do that is
+
+ (?<=\d{3}...)(?<!999)foo
+
+This time the first assertion looks at the preceding six characters, checking
+that the first three are digits, and then the second assertion checks that the
+preceding three characters are not "999".
+
++Assertions can be nested in any combination. For example, +
+ (?<=(?<!foo)bar)baz ++matches an occurrence of "baz" that is preceded by "bar" which in turn is not +preceded by "foo", while +
+ (?<=\d{3}(?!999)...)foo
+
+is another pattern that matches "foo" preceded by three digits and any three
+characters that are not "999".
+
++Traditional lookaround assertions are atomic. That is, if an assertion is true, +but there is a subsequent matching failure, there is no backtracking into the +assertion. However, there are some cases where non-atomic positive assertions +can be useful. PCRE2 provides these using the following syntax: +
+ (*non_atomic_positive_lookahead: or (*napla: or (?* + (*non_atomic_positive_lookbehind: or (*naplb: or (?<* ++Consider the problem of finding the right-most word in a string that also +appears earlier in the string, that is, it must appear at least twice in total. +This pattern returns the required result as captured substring 1: +
+ ^(?x)(*napla: .* \b(\w++)) (?> .*? \b\1\b ){2}
+
+For a subject such as "word1 word2 word3 word2 word3 word4" the result is
+"word3". How does it work? At the start, ^(?x) anchors the pattern and sets the
+"x" option, which causes white space (introduced for readability) to be
+ignored. Inside the assertion, the greedy .* at first consumes the entire
+string, but then has to backtrack until the rest of the assertion can match a
+word, which is captured by group 1. In other words, when the assertion first
+succeeds, it captures the right-most word in the string.
+
++The current matching point is then reset to the start of the subject, and the +rest of the pattern match checks for two occurrences of the captured word, +using an ungreedy .*? to scan from the left. If this succeeds, we are done, but +if the last word in the string does not occur twice, this part of the pattern +fails. If a traditional atomic lookahead (?= or (*pla: had been used, the +assertion could not be re-entered, and the whole match would fail. The pattern +would succeed only if the very last word in the subject was found twice. +
++Using a non-atomic lookahead, however, means that when the last word does not +occur twice in the string, the lookahead can backtrack and find the second-last +word, and so on, until either the match succeeds, or all words have been +tested. +
++Two conditions must be met for a non-atomic assertion to be useful: the +contents of one or more capturing groups must change after a backtrack into the +assertion, and there must be a backreference to a changed group later in the +pattern. If this is not the case, the rest of the pattern match fails exactly +as before because nothing has changed, so using a non-atomic assertion just +wastes resources. +
++There is one exception to backtracking into a non-atomic assertion. If an +(*ACCEPT) control verb is triggered, the assertion succeeds atomically. That +is, a subsequent match failure cannot backtrack into the assertion. +
++Non-atomic assertions are not supported by the alternative matching function +pcre2_dfa_match(). They are supported by JIT, but only if they do not +contain any control verbs such as (*ACCEPT). (This may change in future). Note +that assertions that appear as conditions for +conditional groups +(see below) must be atomic. +
++In concept, a script run is a sequence of characters that are all from the same +Unicode script such as Latin or Greek. However, because some scripts are +commonly used together, and because some diacritical and other marks are used +with multiple scripts, it is not that simple. There is a full description of +the rules that PCRE2 uses in the section entitled +"Script Runs" +in the +pcre2unicode +documentation. +
++If part of a pattern is enclosed between (*script_run: or (*sr: and a closing +parenthesis, it fails if the sequence of characters that it matches is not a +script run. After a failure, normal backtracking occurs. Script runs can be +used to detect spoofing attacks using characters that look the same, but are +from different scripts. The string "paypal.com" is an infamous example, where +the letters could be a mixture of Latin and Cyrillic. This pattern ensures that +the matched characters in a sequence of non-spaces that follow white space are +a script run: +
+ \s+(*sr:\S+) ++To be sure that they are all from the Latin script (for example), a lookahead +can be used: +
+ \s+(?=\p{Latin})(*sr:\S+)
+
+This works as long as the first character is expected to be a character in that
+script, and not (for example) punctuation, which is allowed with any script. If
+this is not the case, a more creative lookahead is needed. For example, if
+digits, underscore, and dots are permitted at the start:
+
+ \s+(?=[0-9_.]*\p{Latin})(*sr:\S+)
+
+
+
++In many cases, backtracking into a script run pattern fragment is not +desirable. The script run can employ an atomic group to prevent this. Because +this is a common requirement, a shorthand notation is provided by +(*atomic_script_run: or (*asr: +
+ (*asr:...) is the same as (*sr:(?>...)) ++Note that the atomic group is inside the script run. Putting it outside would +not prevent backtracking into the script run pattern. + +
+Support for script runs is not available if PCRE2 is compiled without Unicode +support. A compile-time error is given if any of the above constructs is +encountered. Script runs are not supported by the alternate matching function, +pcre2_dfa_match() because they use the same mechanism as capturing +parentheses. +
++Warning: The (*ACCEPT) control verb +(see below) +should not be used within a script run group, because it causes an immediate +exit from the group, bypassing the script run checking. +
++It is possible to cause the matching process to obey a pattern fragment +conditionally or to choose between two alternative fragments, depending on +the result of an assertion, or whether a specific capture group has +already been matched. The two possible forms of conditional group are: +
+ (?(condition)yes-pattern) + (?(condition)yes-pattern|no-pattern) ++If the condition is satisfied, the yes-pattern is used; otherwise the +no-pattern (if present) is used. An absent no-pattern is equivalent to an empty +string (it always matches). If there are more than two alternatives in the +group, a compile-time error occurs. Each of the two alternatives may itself +contain nested groups of any form, including conditional groups; the +restriction to two alternatives applies only at the level of the condition +itself. This pattern fragment is an example where the alternatives are complex: +
+ (?(1) (A|B|C) | (D | (?(2)E|F) | E) ) + ++ +
+There are five kinds of condition: references to capture groups, references to +recursion, two pseudo-conditions called DEFINE and VERSION, and assertions. +
++If the text between the parentheses consists of a sequence of digits, the +condition is true if a capture group of that number has previously matched. If +there is more than one capture group with the same number (see the earlier +section about duplicate group numbers), +the condition is true if any of them have matched. An alternative notation, +which is a PCRE2 extension, not supported by Perl, is to precede the digits +with a plus or minus sign. In this case, the group number is relative rather +than absolute. The most recently opened capture group (which could be enclosing +this condition) can be referenced by (?(-1), the next most recent by (?(-2), +and so on. Inside loops it can also make sense to refer to subsequent groups. +The next capture group to be opened can be referenced as (?(+1), and so on. The +value zero in any of these forms is not used; it provokes a compile-time error. +
++Consider the following pattern, which contains non-significant white space to +make it more readable (assume the PCRE2_EXTENDED option) and to divide it into +three parts for ease of discussion: +
+ ( \( )? [^()]+ (?(1) \) ) ++The first part matches an optional opening parenthesis, and if that +character is present, sets it as the first captured substring. The second part +matches one or more characters that are not parentheses. The third part is a +conditional group that tests whether or not the first capture group +matched. If it did, that is, if subject started with an opening parenthesis, +the condition is true, and so the yes-pattern is executed and a closing +parenthesis is required. Otherwise, since no-pattern is not present, the +conditional group matches nothing. In other words, this pattern matches a +sequence of non-parentheses, optionally enclosed in parentheses. + +
+If you were embedding this pattern in a larger one, you could use a relative +reference: +
+ ...other stuff... ( \( )? [^()]+ (?(-1) \) ) ... ++This makes the fragment independent of the parentheses in the larger pattern. + +
+Perl uses the syntax (?(<name>)...) or (?('name')...) to test for a used +capture group by name. For compatibility with earlier versions of PCRE1, which +had this facility before Perl, the syntax (?(name)...) is also recognized. +Note, however, that undelimited names consisting of the letter R followed by +digits are ambiguous (see the following section). Rewriting the above example +to use a named group gives this: +
+ (?<OPEN> \( )? [^()]+ (?(<OPEN>) \) ) ++If the name used in a condition of this kind is a duplicate, the test is +applied to all groups of the same name, and is true if any one of them has +matched. + +
+"Recursion" in this sense refers to any subroutine-like call from one part of +the pattern to another, whether or not it is actually recursive. See the +sections entitled +"Recursive patterns" +and +"Groups as subroutines" +below for details of recursion and subroutine calls. +
++If a condition is the string (R), and there is no capture group with the name +R, the condition is true if matching is currently in a recursion or subroutine +call to the whole pattern or any capture group. If digits follow the letter R, +and there is no group with that name, the condition is true if the most recent +call is into a group with the given number, which must exist somewhere in the +overall pattern. This is a contrived example that is equivalent to a+b: +
+ ((?(R1)a+|(?1)b)) ++However, in both cases, if there is a capture group with a matching name, the +condition tests for its being set, as described in the section above, instead +of testing for recursion. For example, creating a group with the name R1 by +adding (?<R1>) to the above pattern completely changes its meaning. + +
+If a name preceded by ampersand follows the letter R, for example: +
+ (?(R&name)...) ++the condition is true if the most recent recursion is into a group of that name +(which must exist within the pattern). + +
+This condition does not check the entire recursion stack. It tests only the +current level. If the name used in a condition of this kind is a duplicate, the +test is applied to all groups of the same name, and is true if any one of +them is the most recent recursion. +
++At "top level", all these recursion test conditions are false. +
++If the condition is the string (DEFINE), the condition is always false, even if +there is a group with the name DEFINE. In this case, there may be only one +alternative in the rest of the conditional group. It is always skipped if +control reaches this point in the pattern; the idea of DEFINE is that it can be +used to define subroutines that can be referenced from elsewhere. (The use of +subroutines +is described below.) For example, a pattern to match an IPv4 address such as +"192.168.23.245" could be written like this (ignore white space and line +breaks): +
+ (?(DEFINE) (?<byte> 2[0-4]\d | 25[0-5] | 1\d\d | [1-9]?\d) )
+ \b (?&byte) (\.(?&byte)){3} \b
+
+The first part of the pattern is a DEFINE group inside which another group
+named "byte" is defined. This matches an individual component of an IPv4
+address (a number less than 256). When matching takes place, this part of the
+pattern is skipped because DEFINE acts like a false condition. The rest of the
+pattern uses references to the named group to match the four dot-separated
+components of an IPv4 address, insisting on a word boundary at each end.
+
++Programs that link with a PCRE2 library can check the version by calling +pcre2_config() with appropriate arguments. Users of applications that do +not have access to the underlying code cannot do this. A special "condition" +called VERSION exists to allow such users to discover which version of PCRE2 +they are dealing with by using this condition to match a string such as +"yesno". VERSION must be followed either by "=" or ">=" and a version number. +For example: +
+ (?(VERSION>=10.4)yes|no) ++This pattern matches "yes" if the PCRE2 version is greater than or equal to 10.4, or +"no" otherwise. The fractional part of the version number may not contain more +than two digits. + +
+If the condition is not in any of the above formats, it must be a parenthesized +assertion. This may be a positive or negative lookahead or lookbehind +assertion. However, it must be a traditional atomic assertion, not one of the +non-atomic assertions. +
++Consider this pattern, again containing non-significant white space, and with +the two alternatives on the second line: +
+ (?(?=[^a-z]*[a-z])
+ \d{2}-[a-z]{3}-\d{2} | \d{2}-\d{2}-\d{2} )
+
+The condition is a positive lookahead assertion that matches an optional
+sequence of non-letters followed by a letter. In other words, it tests for the
+presence of at least one letter in the subject. If a letter is found, the
+subject is matched against the first alternative; otherwise it is matched
+against the second. This pattern matches strings in one of the two forms
+dd-aaa-dd or dd-dd-dd, where aaa are letters and dd are digits.
+
++When an assertion that is a condition contains capture groups, any +capturing that occurs in a matching branch is retained afterwards, for both +positive and negative assertions, because matching always continues after the +assertion, whether it succeeds or fails. (Compare non-conditional assertions, +for which captures are retained only for positive assertions that succeed.) +
++There are two ways of including comments in patterns that are processed by +PCRE2. In both cases, the start of the comment must not be in a character +class, nor in the middle of any other sequence of related characters such as +(?: or a group name or number. The characters that make up a comment play +no part in the pattern matching. +
++The sequence (?# marks the start of a comment that continues up to the next +closing parenthesis. Nested parentheses are not permitted. If the +PCRE2_EXTENDED or PCRE2_EXTENDED_MORE option is set, an unescaped # character +also introduces a comment, which in this case continues to immediately after +the next newline character or character sequence in the pattern. Which +characters are interpreted as newlines is controlled by an option passed to the +compiling function or by a special sequence at the start of the pattern, as +described in the section entitled +"Newline conventions" +above. Note that the end of this type of comment is a literal newline sequence +in the pattern; escape sequences that happen to represent a newline do not +count. For example, consider this pattern when PCRE2_EXTENDED is set, and the +default newline convention (a single linefeed character) is in force: +
+ abc #comment \n still comment ++On encountering the # character, pcre2_compile() skips along, looking for +a newline in the pattern. The sequence \n is still literal at this stage, so +it does not terminate the comment. Only an actual character with the code value +0x0a (the default newline) does so. + +
+Consider the problem of matching a string in parentheses, allowing for +unlimited nested parentheses. Without the use of recursion, the best that can +be done is to use a pattern that matches up to some fixed depth of nesting. It +is not possible to handle an arbitrary nesting depth. +
++For some time, Perl has provided a facility that allows regular expressions to +recurse (amongst other things). It does this by interpolating Perl code in the +expression at run time, and the code can refer to the expression itself. A Perl +pattern using code interpolation to solve the parentheses problem can be +created like this: +
+ $re = qr{\( (?: (?>[^()]+) | (?p{$re}) )* \)}x;
+
+The (?p{...}) item interpolates Perl code at run time, and in this case refers
+recursively to the pattern in which it appears.
+
++Obviously, PCRE2 cannot support the interpolation of Perl code. Instead, it +supports special syntax for recursion of the entire pattern, and also for +individual capture group recursion. After its introduction in PCRE1 and Python, +this kind of recursion was subsequently introduced into Perl at release 5.10. +
++A special item that consists of (? followed by a number greater than zero and a +closing parenthesis is a recursive subroutine call of the capture group of the +given number, provided that it occurs inside that group. (If not, it is a +non-recursive subroutine +call, which is described in the next section.) The special item (?R) or (?0) is +a recursive call of the entire regular expression. +
++This PCRE2 pattern solves the nested parentheses problem (assume the +PCRE2_EXTENDED option is set so that white space is ignored): +
+ \( ( [^()]++ | (?R) )* \) ++First it matches an opening parenthesis. Then it matches any number of +substrings which can either be a sequence of non-parentheses, or a recursive +match of the pattern itself (that is, a correctly parenthesized substring). +Finally there is a closing parenthesis. Note the use of a possessive quantifier +to avoid backtracking into sequences of non-parentheses. + +
+If this were part of a larger pattern, you would not want to recurse the entire +pattern, so instead you could use this: +
+ ( \( ( [^()]++ | (?1) )* \) ) ++We have put the pattern into parentheses, and caused the recursion to refer to +them instead of the whole pattern. + +
+In a larger pattern, keeping track of parenthesis numbers can be tricky. This +is made easier by the use of relative references. Instead of (?1) in the +pattern above you can write (?-2) to refer to the second most recently opened +parentheses preceding the recursion. In other words, a negative number counts +capturing parentheses leftwards from the point at which it is encountered. +
++Be aware however, that if +duplicate capture group numbers +are in use, relative references refer to the earliest group with the +appropriate number. Consider, for example: +
+ (?|(a)|(b)) (c) (?-2) ++The first two capture groups (a) and (b) are both numbered 1, and group (c) +is number 2. When the reference (?-2) is encountered, the second most recently +opened parentheses has the number 1, but it is the first such group (the (a) +group) to which the recursion refers. This would be the same if an absolute +reference (?1) was used. In other words, relative references are just a +shorthand for computing a group number. + +
+It is also possible to refer to subsequent capture groups, by writing +references such as (?+2). However, these cannot be recursive because the +reference is not inside the parentheses that are referenced. They are always +non-recursive subroutine +calls, as described in the next section. +
++An alternative approach is to use named parentheses. The Perl syntax for this +is (?&name); PCRE1's earlier syntax (?P>name) is also supported. We could +rewrite the above example as follows: +
+ (?<pn> \( ( [^()]++ | (?&pn) )* \) ) ++If there is more than one group with the same name, the earliest one is +used. + +
+The example pattern that we have been looking at contains nested unlimited +repeats, and so the use of a possessive quantifier for matching strings of +non-parentheses is important when applying the pattern to strings that do not +match. For example, when this pattern is applied to +
+ (aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa() ++it yields "no match" quickly. However, if a possessive quantifier is not used, +the match runs for a very long time indeed because there are so many different +ways the + and * repeats can carve up the subject, and all have to be tested +before failure can be reported. + +
+At the end of a match, the values of capturing parentheses are those from +the outermost level. If you want to obtain intermediate values, a callout +function can be used (see below and the +pcre2callout +documentation). If the pattern above is matched against +
+ (ab(cd)ef) ++the value for the inner capturing parentheses (numbered 2) is "ef", which is +the last value taken on at the top level. If a capture group is not matched at +the top level, its final captured value is unset, even if it was (temporarily) +set at a deeper level during the matching process. + +
+Do not confuse the (?R) item with the condition (R), which tests for recursion. +Consider this pattern, which matches text in angle brackets, allowing for +arbitrary nesting. Only digits are allowed in nested brackets (that is, when +recursing), whereas any characters are permitted at the outer level. +
+ < (?: (?(R) \d++ | [^<>]*+) | (?R)) * > ++In this pattern, (?(R) is the start of a conditional group, with two different +alternatives for the recursive and non-recursive cases. The (?R) item is the +actual recursive call. + +
+Some former differences between PCRE2 and Perl no longer exist. +
++Before release 10.30, recursion processing in PCRE2 differed from Perl in that +a recursive subroutine call was always treated as an atomic group. That is, +once it had matched some of the subject string, it was never re-entered, even +if it contained untried alternatives and there was a subsequent matching +failure. (Historical note: PCRE implemented recursion before Perl did.) +
++Starting with release 10.30, recursive subroutine calls are no longer treated +as atomic. That is, they can be re-entered to try unused alternatives if there +is a matching failure later in the pattern. This is now compatible with the way +Perl works. If you want a subroutine call to be atomic, you must explicitly +enclose it in an atomic group. +
++Supporting backtracking into recursions simplifies certain types of recursive +pattern. For example, this pattern matches palindromic strings: +
+ ^((.)(?1)\2|.?)$ ++The second branch in the group matches a single central character in the +palindrome when there are an odd number of characters, or nothing when there +are an even number of characters, but in order to work it has to be able to try +the second case when the rest of the pattern match fails. If you want to match +typical palindromic phrases, the pattern has to ignore all non-word characters, +which can be done like this: +
+ ^\W*+((.)\W*+(?1)\W*+\2|\W*+.?)\W*+$ ++If run with the PCRE2_CASELESS option, this pattern matches phrases such as "A +man, a plan, a canal: Panama!". Note the use of the possessive quantifier *+ to +avoid backtracking into sequences of non-word characters. Without this, PCRE2 +takes a great deal longer (ten times or more) to match typical phrases, and +Perl takes so long that you think it has gone into a loop. + +
+Another way in which PCRE2 and Perl used to differ in their recursion +processing is in the handling of captured values. Formerly in Perl, when a +group was called recursively or as a subroutine (see the next section), it +had no access to any values that were captured outside the recursion, whereas +in PCRE2 these values can be referenced. Consider this pattern: +
+ ^(.)(\1|a(?2)) ++This pattern matches "bab". The first capturing parentheses match "b", then in +the second group, when the backreference \1 fails to match "b", the second +alternative matches "a" and then recurses. In the recursion, \1 does now match +"b" and so the whole match succeeds. This match used to fail in Perl, but in +later versions (I tried 5.024) it now works. + +
+If the syntax for a recursive group call (either by number or by name) is used +outside the parentheses to which it refers, it operates a bit like a subroutine +in a programming language. More accurately, PCRE2 treats the referenced group +as an independent subpattern which it tries to match at the current matching +position. The called group may be defined before or after the reference. A +numbered reference can be absolute or relative, as in these examples: +
+ (...(absolute)...)...(?2)... + (...(relative)...)...(?-1)... + (...(?+1)...(relative)... ++An earlier example pointed out that the pattern +
+ (sens|respons)e and \1ibility ++matches "sense and sensibility" and "response and responsibility", but not +"sense and responsibility". If instead the pattern +
+ (sens|respons)e and (?1)ibility ++is used, it does match "sense and responsibility" as well as the other two +strings. Another example is given in the discussion of DEFINE above. + +
+Like recursions, subroutine calls used to be treated as atomic, but this +changed at PCRE2 release 10.30, so backtracking into subroutine calls can now +occur. However, any capturing parentheses that are set during the subroutine +call revert to their previous values afterwards. +
++Processing options such as case-independence are fixed when a group is +defined, so if it is used as a subroutine, such options cannot be changed for +different calls. For example, consider this pattern: +
+ (abc)(?i:(?-1)) ++It matches "abcabc". It does not match "abcABC" because the change of +processing option does not affect the called group. + +
+The behaviour of +backtracking control verbs +in groups when called as subroutines is described in the section entitled +"Backtracking verbs in subroutines" +below. +
++For compatibility with Oniguruma, the non-Perl syntax \g followed by a name or +a number enclosed either in angle brackets or single quotes, is an alternative +syntax for calling a group as a subroutine, possibly recursively. Here are two +of the examples used above, rewritten using this syntax: +
+ (?<pn> \( ( (?>[^()]+) | \g<pn> )* \) ) + (sens|respons)e and \g'1'ibility ++PCRE2 supports an extension to Oniguruma: if a number is preceded by a +plus or a minus sign it is taken as a relative reference. For example: +
+ (abc)(?i:\g<-1>) ++Note that \g{...} (Perl syntax) and \g<...> (Oniguruma syntax) are not +synonymous. The former is a backreference; the latter is a subroutine call. + +
+Perl has a feature whereby using the sequence (?{...}) causes arbitrary Perl +code to be obeyed in the middle of matching a regular expression. This makes it +possible, amongst other things, to extract different substrings that match the +same pair of parentheses when there is a repetition. +
++PCRE2 provides a similar feature, but of course it cannot obey arbitrary Perl +code. The feature is called "callout". The caller of PCRE2 provides an external +function by putting its entry point in a match context using the function +pcre2_set_callout(), and then passing that context to pcre2_match() +or pcre2_dfa_match(). If no match context is passed, or if the callout +entry point is set to NULL, callouts are disabled. +
++Within a regular expression, (?C<arg>) indicates a point at which the external +function is to be called. There are two kinds of callout: those with a +numerical argument and those with a string argument. (?C) on its own with no +argument is treated as (?C0). A numerical argument allows the application to +distinguish between different callouts. String arguments were added for release +10.20 to make it possible for script languages that use PCRE2 to embed short +scripts within patterns in a similar way to Perl. +
++During matching, when PCRE2 reaches a callout point, the external function is +called. It is provided with the number or string argument of the callout, the +position in the pattern, and one item of data that is also set in the match +block. The callout function may cause matching to proceed, to backtrack, or to +fail. +
++By default, PCRE2 implements a number of optimizations at matching time, and +one side-effect is that sometimes callouts are skipped. If you need all +possible callouts to happen, you need to set options that disable the relevant +optimizations. More details, including a complete description of the +programming interface to the callout function, are given in the +pcre2callout +documentation. +
++If you just want to have a means of identifying different callout points, put a +number less than 256 after the letter C. For example, this pattern has two +callout points: +
+ (?C1)abc(?C2)def ++If the PCRE2_AUTO_CALLOUT flag is passed to pcre2_compile(), numerical +callouts are automatically installed before each item in the pattern. They are +all numbered 255. If there is a conditional group in the pattern whose +condition is an assertion, an additional callout is inserted just before the +condition. An explicit callout may also be set at this position, as in this +example: +
+ (?(?C9)(?=a)abc|def) ++Note that this applies only to assertion conditions, not to other types of +condition. + +
+A delimited string may be used instead of a number as a callout argument. The +starting delimiter must be one of ` ' " ^ % # $ { and the ending delimiter is +the same as the start, except for {, where the ending delimiter is }. If the +ending delimiter is needed within the string, it must be doubled. For +example: +
+ (?C'ab ''c'' d')xyz(?C{any text})pqr
+
+The doubling is removed before the string is passed to the callout function.
+
++There are a number of special "Backtracking Control Verbs" (to use Perl's +terminology) that modify the behaviour of backtracking during matching. They +are generally of the form (*VERB) or (*VERB:NAME). Some verbs take either form, +and may behave differently depending on whether or not a name argument is +present. The names are not required to be unique within the pattern. +
++By default, for compatibility with Perl, a name is any sequence of characters +that does not include a closing parenthesis. The name is not processed in +any way, and it is not possible to include a closing parenthesis in the name. +This can be changed by setting the PCRE2_ALT_VERBNAMES option, but the result +is no longer Perl-compatible. +
++When PCRE2_ALT_VERBNAMES is set, backslash processing is applied to verb names +and only an unescaped closing parenthesis terminates the name. However, the +only backslash items that are permitted are \Q, \E, and sequences such as +\x{100} that define character code points. Character type escapes such as \d +are faulted. +
++A closing parenthesis can be included in a name either as \) or between \Q +and \E. In addition to backslash processing, if the PCRE2_EXTENDED or +PCRE2_EXTENDED_MORE option is also set, unescaped whitespace in verb names is +skipped, and #-comments are recognized, exactly as in the rest of the pattern. +PCRE2_EXTENDED and PCRE2_EXTENDED_MORE do not affect verb names unless +PCRE2_ALT_VERBNAMES is also set. +
++The maximum length of a name is 255 in the 8-bit library and 65535 in the +16-bit and 32-bit libraries. If the name is empty, that is, if the closing +parenthesis immediately follows the colon, the effect is as if the colon were +not there. Any number of these verbs may occur in a pattern. Except for +(*ACCEPT), they may not be quantified. +
++Since these verbs are specifically related to backtracking, most of them can be +used only when the pattern is to be matched using the traditional matching +function, because that uses a backtracking algorithm. With the exception of +(*FAIL), which behaves like a failing negative assertion, the backtracking +control verbs cause an error if encountered by the DFA matching function. +
++The behaviour of these verbs in +repeated groups, +assertions, +and in +capture groups called as subroutines +(whether or not recursively) is documented below. +
++PCRE2 contains some optimizations that are used to speed up matching by running +some checks at the start of each match attempt. For example, it may know the +minimum length of matching subject, or that a particular character must be +present. When one of these optimizations bypasses the running of a match, any +included backtracking verbs will not, of course, be processed. You can suppress +the start-of-match optimizations by setting the PCRE2_NO_START_OPTIMIZE option +when calling pcre2_compile(), or by starting the pattern with +(*NO_START_OPT). There is more discussion of this option in the section +entitled +"Compiling a pattern" +in the +pcre2api +documentation. +
++Experiments with Perl suggest that it too has similar optimizations, and like +PCRE2, turning them off can change the result of a match. +
++The following verbs act as soon as they are encountered. +
+ (*ACCEPT) or (*ACCEPT:NAME) ++This verb causes the match to end successfully, skipping the remainder of the +pattern. However, when it is inside a capture group that is called as a +subroutine, only that group is ended successfully. Matching then continues +at the outer level. If (*ACCEPT) is triggered in a positive assertion, the +assertion succeeds; in a negative assertion, the assertion fails. +
+If (*ACCEPT) is inside capturing parentheses, the data so far is captured. For +example: +
+ A((?:A|B(*ACCEPT)|C)D) ++This matches "AB", "AAD", or "ACD"; when it matches "AB", "B" is captured by +the outer parentheses. + +
+(*ACCEPT) is the only backtracking verb that is allowed to be quantified +because an ungreedy quantification with a minimum of zero acts only when a +backtrack happens. Consider, for example, +
+ (A(*ACCEPT)??B)C ++where A, B, and C may be complex expressions. After matching "A", the matcher +processes "BC"; if that fails, causing a backtrack, (*ACCEPT) is triggered and +the match succeeds. In both cases, all but C is captured. Whereas (*COMMIT) +(see below) means "fail on backtrack", a repeated (*ACCEPT) of this type means +"succeed on backtrack". + +
+Warning: (*ACCEPT) should not be used within a script run group, because +it causes an immediate exit from the group, bypassing the script run checking. +
+ (*FAIL) or (*FAIL:NAME) ++This verb causes a matching failure, forcing backtracking to occur. It may be +abbreviated to (*F). It is equivalent to (?!) but easier to read. The Perl +documentation notes that it is probably useful only when combined with (?{}) or +(??{}). Those are, of course, Perl features that are not present in PCRE2. The +nearest equivalent is the callout feature, as for example in this pattern: +
+ a+(?C)(*FAIL) ++A match with the string "aaaa" always fails, but the callout is taken before +each backtrack happens (in this example, 10 times). + +
+(*ACCEPT:NAME) and (*FAIL:NAME) behave the same as (*MARK:NAME)(*ACCEPT) and +(*MARK:NAME)(*FAIL), respectively, that is, a (*MARK) is recorded just before +the verb acts. +
++There is one verb whose main purpose is to track how a match was arrived at, +though it also has a secondary use in conjunction with advancing the match +starting point (see (*SKIP) below). +
+ (*MARK:NAME) or (*:NAME) ++A name is always required with this verb. For all the other backtracking +control verbs, a NAME argument is optional. + +
+When a match succeeds, the last-encountered mark name on the +matching path is passed back to the caller as described in the section entitled +"Other information about the match" +in the +pcre2api +documentation. This applies to all instances of (*MARK) and other verbs, +including those inside assertions and atomic groups. However, there are +differences in those cases when (*MARK) is used in conjunction with (*SKIP) as +described below. +
++The mark name that was last encountered on the matching path is passed back. A +verb without a NAME argument is ignored for this purpose. Here is an example of +pcre2test output, where the "mark" modifier requests the retrieval and +outputting of (*MARK) data: +
+ re> /X(*MARK:A)Y|X(*MARK:B)Z/mark + data> XY + 0: XY + MK: A + XZ + 0: XZ + MK: B ++The (*MARK) name is tagged with "MK:" in this output, and in this example it +indicates which of the two alternatives matched. This is a more efficient way +of obtaining this information than putting each alternative in its own +capturing parentheses. + +
+If a verb with a name is encountered in a positive assertion that is true, the +name is recorded and passed back if it is the last-encountered. This does not +happen for negative assertions or failing positive assertions. +
++After a partial match or a failed match, the last encountered name in the +entire match process is returned. For example: +
+ re> /X(*MARK:A)Y|X(*MARK:B)Z/mark + data> XP + No match, mark = B ++Note that in this unanchored example the mark is retained from the match +attempt that started at the letter "X" in the subject. Subsequent match +attempts starting at "P" and then with an empty string do not get as far as the +(*MARK) item, but nevertheless do not reset it. + +
+If you are interested in (*MARK) values after failed matches, you should +probably set the PCRE2_NO_START_OPTIMIZE option +(see above) +to ensure that the match is always attempted. +
++The following verbs do nothing when they are encountered. Matching continues +with what follows, but if there is a subsequent match failure, causing a +backtrack to the verb, a failure is forced. That is, backtracking cannot pass +to the left of the verb. However, when one of these verbs appears inside an +atomic group or in a lookaround assertion that is true, its effect is confined +to that group, because once the group has been matched, there is never any +backtracking into it. Backtracking from beyond an assertion or an atomic group +ignores the entire group, and seeks a preceding backtracking point. +
++These verbs differ in exactly what kind of failure occurs when backtracking +reaches them. The behaviour described below is what happens when the verb is +not in a subroutine or an assertion. Subsequent sections cover these special +cases. +
+ (*COMMIT) or (*COMMIT:NAME) ++This verb causes the whole match to fail outright if there is a later matching +failure that causes backtracking to reach it. Even if the pattern is +unanchored, no further attempts to find a match by advancing the starting point +take place. If (*COMMIT) is the only backtracking verb that is encountered, +once it has been passed pcre2_match() is committed to finding a match at +the current starting point, or not at all. For example: +
+ a+(*COMMIT)b ++This matches "xxaab" but not "aacaab". It can be thought of as a kind of +dynamic anchor, or "I've started, so I must finish." + +
+The behaviour of (*COMMIT:NAME) is not the same as (*MARK:NAME)(*COMMIT). It is +like (*MARK:NAME) in that the name is remembered for passing back to the +caller. However, (*SKIP:NAME) searches only for names that are set with +(*MARK), ignoring those set by any of the other backtracking verbs. +
++If there is more than one backtracking verb in a pattern, a different one that +follows (*COMMIT) may be triggered first, so merely passing (*COMMIT) during a +match does not always guarantee that a match must be at this starting point. +
++Note that (*COMMIT) at the start of a pattern is not the same as an anchor, +unless PCRE2's start-of-match optimizations are turned off, as shown in this +output from pcre2test: +
+ re> /(*COMMIT)abc/ + data> xyzabc + 0: abc + data> + re> /(*COMMIT)abc/no_start_optimize + data> xyzabc + No match ++For the first pattern, PCRE2 knows that any match must start with "a", so the +optimization skips along the subject to "a" before applying the pattern to the +first set of data. The match attempt then succeeds. The second pattern disables +the optimization that skips along to the first character. The pattern is now +applied starting at "x", and so the (*COMMIT) causes the match to fail without +trying any other starting points. +
+ (*PRUNE) or (*PRUNE:NAME) ++This verb causes the match to fail at the current starting position in the +subject if there is a later matching failure that causes backtracking to reach +it. If the pattern is unanchored, the normal "bumpalong" advance to the next +starting character then happens. Backtracking can occur as usual to the left of +(*PRUNE), before it is reached, or when matching to the right of (*PRUNE), but +if there is no match to the right, backtracking cannot cross (*PRUNE). In +simple cases, the use of (*PRUNE) is just an alternative to an atomic group or +possessive quantifier, but there are some uses of (*PRUNE) that cannot be +expressed in any other way. In an anchored pattern (*PRUNE) has the same effect +as (*COMMIT). + +
+The behaviour of (*PRUNE:NAME) is not the same as (*MARK:NAME)(*PRUNE). It is +like (*MARK:NAME) in that the name is remembered for passing back to the +caller. However, (*SKIP:NAME) searches only for names set with (*MARK), +ignoring those set by other backtracking verbs. +
+ (*SKIP) ++This verb, when given without a name, is like (*PRUNE), except that if the +pattern is unanchored, the "bumpalong" advance is not to the next character, +but to the position in the subject where (*SKIP) was encountered. (*SKIP) +signifies that whatever text was matched leading up to it cannot be part of a +successful match if there is a later mismatch. Consider: +
+ a+(*SKIP)b ++If the subject is "aaaac...", after the first match attempt fails (starting at +the first character in the string), the starting point skips on to start the +next attempt at "c". Note that a possessive quantifier does not have the same +effect as this example; although it would suppress backtracking during the +first match attempt, the second attempt would start at the second character +instead of skipping on to "c". + +
+If (*SKIP) is used to specify a new starting position that is the same as the +starting position of the current match, or (by being inside a lookbehind) +earlier, the position specified by (*SKIP) is ignored, and instead the normal +"bumpalong" occurs. +
+ (*SKIP:NAME) ++When (*SKIP) has an associated name, its behaviour is modified. When such a +(*SKIP) is triggered, the previous path through the pattern is searched for the +most recent (*MARK) that has the same name. If one is found, the "bumpalong" +advance is to the subject position that corresponds to that (*MARK) instead of +to where (*SKIP) was encountered. If no (*MARK) with a matching name is found, +the (*SKIP) is ignored. + +
+The search for a (*MARK) name uses the normal backtracking mechanism, which +means that it does not see (*MARK) settings that are inside atomic groups or +assertions, because they are never re-entered by backtracking. Compare the +following pcre2test examples: +
+ re> /a(?>(*MARK:X))(*SKIP:X)(*F)|(.)/ + data: abc + 0: a + 1: a + data: + re> /a(?:(*MARK:X))(*SKIP:X)(*F)|(.)/ + data: abc + 0: b + 1: b ++In the first example, the (*MARK) setting is in an atomic group, so it is not +seen when (*SKIP:X) triggers, causing the (*SKIP) to be ignored. This allows +the second branch of the pattern to be tried at the first character position. +In the second example, the (*MARK) setting is not in an atomic group. This +allows (*SKIP:X) to find the (*MARK) when it backtracks, and this causes a new +matching attempt to start at the second character. This time, the (*MARK) is +never seen because "a" does not match "b", so the matcher immediately jumps to +the second branch of the pattern. + +
+Note that (*SKIP:NAME) searches only for names set by (*MARK:NAME). It ignores +names that are set by other backtracking verbs. +
+ (*THEN) or (*THEN:NAME) ++This verb causes a skip to the next innermost alternative when backtracking +reaches it. That is, it cancels any further backtracking within the current +alternative. Its name comes from the observation that it can be used for a +pattern-based if-then-else block: +
+ ( COND1 (*THEN) FOO | COND2 (*THEN) BAR | COND3 (*THEN) BAZ ) ... ++If the COND1 pattern matches, FOO is tried (and possibly further items after +the end of the group if FOO succeeds); on failure, the matcher skips to the +second alternative and tries COND2, without backtracking into COND1. If that +succeeds and BAR fails, COND3 is tried. If subsequently BAZ fails, there are no +more alternatives, so there is a backtrack to whatever came before the entire +group. If (*THEN) is not inside an alternation, it acts like (*PRUNE). + +
+The behaviour of (*THEN:NAME) is not the same as (*MARK:NAME)(*THEN). It is +like (*MARK:NAME) in that the name is remembered for passing back to the +caller. However, (*SKIP:NAME) searches only for names set with (*MARK), +ignoring those set by other backtracking verbs. +
++A group that does not contain a | character is just a part of the enclosing +alternative; it is not a nested alternation with only one alternative. The +effect of (*THEN) extends beyond such a group to the enclosing alternative. +Consider this pattern, where A, B, etc. are complex pattern fragments that do +not contain any | characters at this level: +
+ A (B(*THEN)C) | D ++If A and B are matched, but there is a failure in C, matching does not +backtrack into A; instead it moves to the next alternative, that is, D. +However, if the group containing (*THEN) is given an alternative, it +behaves differently: +
+ A (B(*THEN)C | (*FAIL)) | D ++The effect of (*THEN) is now confined to the inner group. After a failure in C, +matching moves to (*FAIL), which causes the whole group to fail because there +are no more alternatives to try. In this case, matching does backtrack into A. + +
+Note that a conditional group is not considered as having two alternatives, +because only one is ever used. In other words, the | character in a conditional +group has a different meaning. Ignoring white space, consider: +
+ ^.*? (?(?=a) a | b(*THEN)c ) ++If the subject is "ba", this pattern does not match. Because .*? is ungreedy, +it initially matches zero characters. The condition (?=a) then fails, the +character "b" is matched, but "c" is not. At this point, matching does not +backtrack to .*? as might perhaps be expected from the presence of the | +character. The conditional group is part of the single alternative that +comprises the whole pattern, and so the match fails. (If there was a backtrack +into .*?, allowing it to match "b", the match would succeed.) + +
+The verbs just described provide four different "strengths" of control when +subsequent matching fails. (*THEN) is the weakest, carrying on the match at the +next alternative. (*PRUNE) comes next, failing the match at the current +starting position, but allowing an advance to the next character (for an +unanchored pattern). (*SKIP) is similar, except that the advance may be more +than one character. (*COMMIT) is the strongest, causing the entire match to +fail. +
++If more than one backtracking verb is present in a pattern, the one that is +backtracked onto first acts. For example, consider this pattern, where A, B, +etc. are complex pattern fragments: +
+ (A(*COMMIT)B(*THEN)C|ABD) ++If A matches but B fails, the backtrack to (*COMMIT) causes the entire match to +fail. However, if A and B match, but C fails, the backtrack to (*THEN) causes +the next alternative (ABD) to be tried. This behaviour is consistent, but is +not always the same as Perl's. It means that if two or more backtracking verbs +appear in succession, all but the last of them have no effect. Consider this +example: +
+ ...(*COMMIT)(*PRUNE)... ++If there is a matching failure to the right, backtracking onto (*PRUNE) causes +it to be triggered, and its action is taken. There can never be a backtrack +onto (*COMMIT). + +
+PCRE2 sometimes differs from Perl in its handling of backtracking verbs in +repeated groups. For example, consider: +
+ /(a(*COMMIT)b)+ac/ ++If the subject is "abac", Perl matches unless its optimizations are disabled, +but PCRE2 always fails because the (*COMMIT) in the second repeat of the group +acts. + +
+(*FAIL) in any assertion has its normal effect: it forces an immediate +backtrack. The behaviour of the other backtracking verbs depends on whether +the assertion is standalone or acting as the condition in a conditional +group. +
++(*ACCEPT) in a standalone positive assertion causes the assertion to succeed +without any further processing; captured strings and a mark name (if set) are +retained. In a standalone negative assertion, (*ACCEPT) causes the assertion to +fail without any further processing; captured substrings and any mark name are +discarded. +
++If the assertion is a condition, (*ACCEPT) causes the condition to be true for +a positive assertion and false for a negative one; captured substrings are +retained in both cases. +
++The remaining verbs act only when a later failure causes a backtrack to +reach them. This means that, for the Perl-compatible assertions, their effect +is confined to the assertion, because Perl lookaround assertions are atomic. A +backtrack that occurs after such an assertion is complete does not jump back +into the assertion. Note in particular that a (*MARK) name that is set in an +assertion is not "seen" by an instance of (*SKIP:NAME) later in the pattern. +
++PCRE2 now supports non-atomic positive assertions, as described in the section +entitled +"Non-atomic assertions" +above. These assertions must be standalone (not used as conditions). They are +not Perl-compatible. For these assertions, a later backtrack does jump back +into the assertion, and therefore verbs such as (*COMMIT) can be triggered by +backtracks from later in the pattern. +
++The effect of (*THEN) is not allowed to escape beyond an assertion. If there +are no more branches to try, (*THEN) causes a positive assertion to be false, +and a negative assertion to be true. +
++The other backtracking verbs are not treated specially if they appear in a +standalone positive assertion. In a conditional positive assertion, +backtracking (from within the assertion) into (*COMMIT), (*SKIP), or (*PRUNE) +causes the condition to be false. However, for both standalone and conditional +negative assertions, backtracking into (*COMMIT), (*SKIP), or (*PRUNE) causes +the assertion to be true, without considering any further alternative branches. +
++These behaviours occur whether or not the group is called recursively. +
++(*ACCEPT) in a group called as a subroutine causes the subroutine match to +succeed without any further processing. Matching then continues after the +subroutine call. Perl documents this behaviour. Perl's treatment of the other +verbs in subroutines is different in some cases. +
++(*FAIL) in a group called as a subroutine has its normal effect: it forces +an immediate backtrack. +
++(*COMMIT), (*SKIP), and (*PRUNE) cause the subroutine match to fail when +triggered by being backtracked to in a group called as a subroutine. There is +then a backtrack at the outer level. +
++(*THEN), when triggered, skips to the next alternative in the innermost +enclosing group that has alternatives (its normal behaviour). However, if there +is no such group within the subroutine's group, the subroutine match fails and +there is a backtrack at the outer level. +
++pcre2api(3), pcre2callout(3), pcre2matching(3), +pcre2syntax(3), pcre2(3). +
+
+Philip Hazel
+
+Retired from University Computing Service
+
+Cambridge, England.
+
+
+Last updated: 04 June 2024
+
+Copyright © 1997-2024 University of Cambridge.
+
+
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2perform.html b/doc/html/pcre2perform.html new file mode 100644 index 0000000..55fdf20 --- /dev/null +++ b/doc/html/pcre2perform.html @@ -0,0 +1,280 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+Two aspects of performance are discussed below: memory usage and processing +time. The way you express your pattern as a regular expression can affect both +of them. +
++Patterns are compiled by PCRE2 into a reasonably efficient interpretive code, +so that most simple patterns do not use much memory for storing the compiled +version. However, there is one case where the memory usage of a compiled +pattern can be unexpectedly large. If a parenthesized group has a quantifier +with a minimum greater than 1 and/or a limited maximum, the whole group is +repeated in the compiled code. For example, the pattern +
+ (abc|def){2,4}
+
+is compiled as if it were
++ (abc|def)(abc|def)((abc|def)(abc|def)?)? ++(Technical aside: It is done this way so that backtrack points within each of +the repetitions can be independently maintained.) + +
+For regular expressions whose quantifiers use only small numbers, this is not +usually a problem. However, if the numbers are large, and particularly if such +repetitions are nested, the memory usage can become an embarrassment. For +example, the very simple pattern +
+ ((ab){1,1000}c){1,3}
+
+uses over 50KiB when compiled using the 8-bit library. When PCRE2 is
+compiled with its default internal pointer size of two bytes, the size limit on
+a compiled pattern is 65535 code units in the 8-bit and 16-bit libraries, and
+this is reached with the above pattern if the outer repetition is increased
+from 3 to 4. PCRE2 can be compiled to use larger internal pointers and thus
+handle larger compiled patterns, but it is better to try to rewrite your
+pattern to use less memory if you can.
+
++One way of reducing the memory usage for such patterns is to make use of +PCRE2's +"subroutine" +facility. Re-writing the above pattern as +
+ ((ab)(?2){0,999}c)(?1){0,2}
+
+reduces the memory requirements to around 16KiB, and indeed it remains under
+20KiB even with the outer repetition increased to 100. However, this kind of
+pattern is not always exactly equivalent, because any captures within
+subroutine calls are lost when the subroutine completes. If this is not a
+problem, this kind of rewriting will allow you to process patterns that PCRE2
+cannot otherwise handle. The matching performance of the two different versions
+of the pattern is roughly the same. (This applies from release 10.30 - things
+were different in earlier releases.)
+
++From release 10.30, the interpretive (non-JIT) version of pcre2_match() +uses very little system stack at run time. In earlier releases recursive +function calls could use a great deal of stack, and this could cause problems, +but this usage has been eliminated. Backtracking positions are now explicitly +remembered in memory frames controlled by the code. +
++The size of each frame depends on the size of pointer variables and the number +of capturing parenthesized groups in the pattern being matched. On a 64-bit +system the frame size for a pattern with no captures is 128 bytes. For each +capturing group the size increases by 16 bytes. +
++Until release 10.41, an initial 20KiB frames vector was allocated on the system +stack, but this still caused some issues for multi-thread applications where +each thread has a very small stack. From release 10.41 backtracking memory +frames are always held in heap memory. An initial heap allocation is obtained +the first time any match data block is passed to pcre2_match(). This is +remembered with the match data block and re-used if that block is used for +another match. It is freed when the match data block itself is freed. +
++The size of the initial block is the larger of 20KiB or ten times the pattern's +frame size, unless the heap limit is less than this, in which case the heap +limit is used. If the initial block proves to be too small during matching, it +is replaced by a larger block, subject to the heap limit. The heap limit is +checked only when a new block is to be allocated. Reducing the heap limit +between calls to pcre2_match() with the same match data block does not +affect the saved block. +
++In contrast to pcre2_match(), pcre2_dfa_match() does use recursive +function calls, but only for processing atomic groups, lookaround assertions, +and recursion within the pattern. The original version of the code used to +allocate quite large internal workspace vectors on the stack, which caused some +problems for some patterns in environments with small stacks. From release +10.32 the code for pcre2_dfa_match() has been re-factored to use heap +memory when necessary for internal workspace when recursing, though recursive +function calls are still used. +
++The "match depth" parameter can be used to limit the depth of function +recursion, and the "match heap" parameter to limit heap memory in +pcre2_dfa_match(). +
++Certain items in regular expression patterns are processed more efficiently +than others. It is more efficient to use a character class like [aeiou] than a +set of single-character alternatives such as (a|e|i|o|u). In general, the +simplest construction that provides the required behaviour is usually the most +efficient. Jeffrey Friedl's book contains a lot of useful general discussion +about optimizing regular expressions for efficient performance. This document +contains a few observations about PCRE2. +
++Using Unicode character properties (the \p, \P, and \X escapes) is slow, +because PCRE2 has to use a multi-stage table lookup whenever it needs a +character's property. If you can find an alternative pattern that does not use +character properties, it will probably be faster. +
++By default, the escape sequences \b, \d, \s, and \w, and the POSIX +character classes such as [:alpha:] do not use Unicode properties, partly for +backwards compatibility, and partly for performance reasons. However, you can +set the PCRE2_UCP option or start the pattern with (*UCP) if you want Unicode +character properties to be used. This can double the matching time for items +such as \d, when matched with pcre2_match(); the performance loss is +less with a DFA matching function, and in both cases there is not much +difference for \b. +
++When a pattern begins with .* not in atomic parentheses, nor in parentheses +that are the subject of a backreference, and the PCRE2_DOTALL option is set, +the pattern is implicitly anchored by PCRE2, since it can match only at the +start of a subject string. If the pattern has multiple top-level branches, they +must all be anchorable. The optimization can be disabled by the +PCRE2_NO_DOTSTAR_ANCHOR option, and is automatically disabled if the pattern +contains (*PRUNE) or (*SKIP). +
++If PCRE2_DOTALL is not set, PCRE2 cannot make this optimization, because the +dot metacharacter does not then match a newline, and if the subject string +contains newlines, the pattern may match from the character immediately +following one of them instead of from the very start. For example, the pattern +
+ .*second ++matches the subject "first\nand second" (where \n stands for a newline +character), with the match starting at the seventh character. In order to do +this, PCRE2 has to retry the match starting after every newline in the subject. + +
+If you are using such a pattern with subject strings that do not contain +newlines, the best performance is obtained by setting PCRE2_DOTALL, or starting +the pattern with ^.* or ^.*? to indicate explicit anchoring. That saves PCRE2 +from having to scan along the subject looking for a newline to restart at. +
++Beware of patterns that contain nested indefinite repeats. These can take a +long time to run when applied to a string that does not match. Consider the +pattern fragment +
+ ^(a+)* ++This can match "aaaa" in 16 different ways, and this number increases very +rapidly as the string gets longer. (The * repeat can match 0, 1, 2, 3, or 4 +times, and for each of those cases other than 0 or 4, the + repeats can match +different numbers of times.) When the remainder of the pattern is such that the +entire match is going to fail, PCRE2 has in principle to try every possible +variation, and this can take an extremely long time, even for relatively short +strings. + +
+An optimization catches some of the more simple cases such as +
+ (a+)*b ++where a literal character follows. Before embarking on the standard matching +procedure, PCRE2 checks that there is a "b" later in the subject string, and if +there is not, it fails the match immediately. However, when there is no +following literal this optimization cannot be used. You can see the difference +by comparing the behaviour of +
+ (a+)*\d ++with the pattern above. The pattern above, (a+)*b, gives a failure almost instantly when +applied to a whole line of "a" characters, whereas the version ending in \d takes an +appreciable time with strings longer than about 20 characters. + +
+In many cases, the solution to this kind of performance issue is to use an +atomic group or a possessive quantifier. This can often reduce memory +requirements as well. As another example, consider this pattern: +
+ ([^<]|<(?!inet))+ ++It matches from wherever it starts until it encounters "<inet" or the end of +the data, and is the kind of pattern that might be used when processing an XML +file. Each iteration of the outer parentheses matches either one character that +is not "<" or a "<" that is not followed by "inet". However, each time a +parenthesis is processed, a backtracking position is passed, so this +formulation uses a memory frame for each matched character. For a long string, +a lot of memory is required. Consider now this rewritten pattern, which matches +exactly the same strings: +
+ ([^<]++|<(?!inet))+ ++This runs much faster, because sequences of characters that do not contain "<" +are "swallowed" in one item inside the parentheses, and a possessive quantifier +is used to stop any backtracking into the runs of non-"<" characters. This +version also uses a lot less memory because entry to a new set of parentheses +happens only when a "<" character that is not followed by "inet" is encountered +(and we assume this is relatively rare). + +
+This example shows that one way of optimizing performance when matching long +subject strings is to write repeated parenthesized subpatterns to match more +than one character whenever possible. +
++You can set limits on the amount of processing that takes place when matching, +and on the amount of heap memory that is used. The default values of the limits +are very large, and unlikely ever to operate. They can be changed when PCRE2 is +built, and they can also be set when pcre2_match() or +pcre2_dfa_match() is called. For details of these interfaces, see the +pcre2build +documentation and the section entitled +"The match context" +in the +pcre2api +documentation. +
++The pcre2test test program has a modifier called "find_limits" which, if +applied to a subject line, causes it to find the smallest limits that allow a +pattern to match. This is done by repeatedly matching with different limits. +
+
+Philip Hazel
+
+Retired from University Computing Service
+
+Cambridge, England.
+
+
+Last updated: 27 July 2022
+
+Copyright © 1997-2022 University of Cambridge.
+
+
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2posix.html b/doc/html/pcre2posix.html new file mode 100644 index 0000000..6e7abd9 --- /dev/null +++ b/doc/html/pcre2posix.html @@ -0,0 +1,379 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+#include <pcre2posix.h> +
+
+int pcre2_regcomp(regex_t *preg, const char *pattern,
+ int cflags);
+
+
+int pcre2_regexec(const regex_t *preg, const char *string,
+ size_t nmatch, regmatch_t pmatch[], int eflags);
+
+
+size_t pcre2_regerror(int errcode, const regex_t *preg,
+ char *errbuf, size_t errbuf_size);
+
+
+void pcre2_regfree(regex_t *preg);
+
+This set of functions provides a POSIX-style API for the PCRE2 regular +expression 8-bit library. There are no POSIX-style wrappers for PCRE2's 16-bit +and 32-bit libraries. See the +pcre2api +documentation for a description of PCRE2's native API, which contains much +additional functionality. +
++IMPORTANT NOTE: The functions described here are NOT thread-safe, and +should not be used in multi-threaded applications. They are also limited to +processing subjects that are not bigger than 2GB. Use the native API instead. +
++These functions are wrapper functions that ultimately call the PCRE2 native +API. Their prototypes are defined in the pcre2posix.h header file, and +they all have unique names starting with pcre2_. However, the +pcre2posix.h header also contains macro definitions that convert the +standard POSIX names such as regcomp() into pcre2_regcomp() etc. This +means that a program can use the usual POSIX names without running the risk of +accidentally linking with POSIX functions from a different library. +
++On Unix-like systems the PCRE2 POSIX library is called libpcre2-posix, so +can be accessed by adding -lpcre2-posix to the command for linking an +application. Because the POSIX functions call the native ones, it is also +necessary to add -lpcre2-8. +
++On Windows systems, if you are linking to a DLL version of the library, it is +recommended that PCRE2POSIX_SHARED is defined before including the +pcre2posix.h header, as it will allow for a more efficient way to +invoke the functions by adding the __declspec(dllimport) decorator. +
++Although they were not defined as prototypes in pcre2posix.h, releases +10.33 to 10.36 of the library contained functions with the POSIX names +regcomp() etc. These simply passed their arguments to the PCRE2 +functions. These functions were provided for backwards compatibility with +earlier versions of PCRE2, which had only POSIX names. However, this has proved +troublesome in situations where a program links with several libraries, some of +which use PCRE2's POSIX interface while others use the real POSIX functions. +For this reason, the POSIX names have been removed since release 10.37. +
++Calling the header file pcre2posix.h avoids any conflict with other POSIX +libraries. It can, of course, be renamed or aliased as regex.h, which is +the "correct" name, if there is no clash. It provides two structure types, +regex_t for compiled internal forms, and regmatch_t for returning +captured substrings. It also defines some constants whose names start with +"REG_"; these are used for setting options and identifying error codes. +
++Note that these functions are just POSIX-style wrappers for PCRE2's native API. +They do not give POSIX regular expression behaviour, and they are not +thread-safe or even POSIX compatible. +
++Those POSIX option bits that can reasonably be mapped to PCRE2 native options +have been implemented. In addition, the option REG_EXTENDED is defined with the +value zero. This has no effect, but since programs that are written to the +POSIX interface often use it, this makes it easier to slot in PCRE2 as a +replacement library. Other POSIX options are not even defined. +
++There are also some options that are not defined by POSIX. These have been +added at the request of users who want to make use of certain PCRE2-specific +features via the POSIX calling interface or to add BSD or GNU functionality. +
++When PCRE2 is called via these functions, it is only the API that is POSIX-like +in style. The syntax and semantics of the regular expressions themselves are +still those of Perl, subject to the setting of various PCRE2 options, as +described below. "POSIX-like in style" means that the API approximates to the +POSIX definition; it is not fully POSIX-compatible, and in multi-unit encoding +domains it is probably even less compatible. +
++The descriptions below use the actual names of the functions, but, as described +above, the standard POSIX names (without the pcre2_ prefix) may also be +used. +
++The function pcre2_regcomp() is called to compile a pattern into an +internal form. By default, the pattern is a C string terminated by a binary +zero (but see REG_PEND below). The preg argument is a pointer to a +regex_t structure that is used as a base for storing information about +the compiled regular expression. It is also used for input when REG_PEND is +set. The regex_t structure used by pcre2_regcomp() is defined in +pcre2posix.h and is not the same as the structure used by other libraries +that provide POSIX-style matching. +
++The argument cflags is either zero, or contains one or more of the bits +defined by the following macros: +
+ REG_DOTALL ++The PCRE2_DOTALL option is set when the regular expression is passed for +compilation to the native function. Note that REG_DOTALL is not part of the +POSIX standard. +
+ REG_ICASE ++The PCRE2_CASELESS option is set when the regular expression is passed for +compilation to the native function. +
+ REG_NEWLINE ++The PCRE2_MULTILINE option is set when the regular expression is passed for +compilation to the native function. Note that this does not mimic the +defined POSIX behaviour for REG_NEWLINE (see the following section). +
+ REG_NOSPEC ++The PCRE2_LITERAL option is set when the regular expression is passed for +compilation to the native function. This disables all meta characters in the +pattern, causing it to be treated as a literal string. The only other options +that are allowed with REG_NOSPEC are REG_ICASE, REG_NOSUB, REG_PEND, and +REG_UTF. Note that REG_NOSPEC is not part of the POSIX standard. +
+ REG_NOSUB ++When a pattern that is compiled with this flag is passed to +pcre2_regexec() for matching, the nmatch and pmatch arguments +are ignored, and no captured strings are returned. Versions of the PCRE2 library +prior to 10.22 used to set the PCRE2_NO_AUTO_CAPTURE compile option, but this +no longer happens because it disables the use of backreferences. +
+ REG_PEND ++If this option is set, the re_endp field in the preg structure +(which has the type const char *) must be set to point to the character beyond +the end of the pattern before calling pcre2_regcomp(). The pattern itself +may now contain binary zeros, which are treated as data characters. Without +REG_PEND, a binary zero terminates the pattern and the re_endp field is +ignored. This is a GNU extension to the POSIX standard and should be used with +caution in software intended to be portable to other systems. +
+ REG_UCP ++The PCRE2_UCP option is set when the regular expression is passed for +compilation to the native function. This causes PCRE2 to use Unicode properties +when matching \d, \w, etc., instead of just recognizing ASCII values. Note +that REG_UCP is not part of the POSIX standard. +
+ REG_UNGREEDY ++The PCRE2_UNGREEDY option is set when the regular expression is passed for +compilation to the native function. Note that REG_UNGREEDY is not part of the +POSIX standard. +
+ REG_UTF ++The PCRE2_UTF option is set when the regular expression is passed for +compilation to the native function. This causes the pattern itself and all data +strings used for matching it to be treated as UTF-8 strings. Note that REG_UTF +is not part of the POSIX standard. + +
+In the absence of these flags, no options are passed to the native function. +This means that the regex is compiled with PCRE2 default semantics. In +particular, the way it handles newline characters in the subject string is the +Perl way, not the POSIX way. Note that setting PCRE2_MULTILINE has only +some of the effects specified for REG_NEWLINE. It does not affect the way +newlines are matched by the dot metacharacter (they are not) or by a negative +class such as [^a] (they are). +
++The yield of pcre2_regcomp() is zero on success, and non-zero otherwise. +The preg structure is filled in on success, and one other member of the +structure (as well as re_endp) is public: re_nsub contains the +number of capturing subpatterns in the regular expression. Various error codes +are defined in the header file. +
++NOTE: If the yield of pcre2_regcomp() is non-zero, you must not attempt +to use the contents of the preg structure. If, for example, you pass it +to pcre2_regexec(), the result is undefined and your program is likely to +crash. +
++This area is not simple, because POSIX and Perl take different views of things. +It is not possible to get PCRE2 to obey POSIX semantics, but then PCRE2 was +never intended to be a POSIX engine. The following table lists the different +possibilities for matching newline characters in Perl and PCRE2: +
+ Default Change with + + . matches newline no PCRE2_DOTALL + newline matches [^a] yes not changeable + $ matches \n at end yes PCRE2_DOLLAR_ENDONLY + $ matches \n in middle no PCRE2_MULTILINE + ^ matches \n in middle no PCRE2_MULTILINE ++This is the equivalent table for a POSIX-compatible pattern matcher: +
+ Default Change with + + . matches newline yes REG_NEWLINE + newline matches [^a] yes REG_NEWLINE + $ matches \n at end no REG_NEWLINE + $ matches \n in middle no REG_NEWLINE + ^ matches \n in middle no REG_NEWLINE ++This behaviour is not what happens when PCRE2 is called via its POSIX +API. By default, PCRE2's behaviour is the same as Perl's, except that there is +no equivalent for PCRE2_DOLLAR_ENDONLY in Perl. In both PCRE2 and Perl, there +is no way to stop newline from matching [^a]. + +
+Default POSIX newline handling can be obtained by setting PCRE2_DOTALL and +PCRE2_DOLLAR_ENDONLY when calling pcre2_compile() directly, but there is +no way to make PCRE2 behave exactly as for the REG_NEWLINE action. When using +the POSIX API, passing REG_NEWLINE to PCRE2's pcre2_regcomp() function +causes PCRE2_MULTILINE to be passed to pcre2_compile(), and REG_DOTALL +passes PCRE2_DOTALL. There is no way to pass PCRE2_DOLLAR_ENDONLY. +
++The function pcre2_regexec() is called to match a compiled pattern +preg against a given string, which is by default terminated by a +zero byte (but see REG_STARTEND below), subject to the options in eflags. +These can be: +
+ REG_NOTBOL ++The PCRE2_NOTBOL option is set when calling the underlying PCRE2 matching +function. +
+ REG_NOTEMPTY ++The PCRE2_NOTEMPTY option is set when calling the underlying PCRE2 matching +function. Note that REG_NOTEMPTY is not part of the POSIX standard. However, +setting this option can give more POSIX-like behaviour in some situations. +
+ REG_NOTEOL ++The PCRE2_NOTEOL option is set when calling the underlying PCRE2 matching +function. +
+ REG_STARTEND ++When this option is set, the subject string starts at string + +pmatch[0].rm_so and ends at string + pmatch[0].rm_eo, which +should point to the first character beyond the string. There may be binary +zeros within the subject string, and indeed, using REG_STARTEND is the only +way to pass a subject string that contains a binary zero. + +
+Whatever the value of pmatch[0].rm_so, the offsets of the matched string +and any captured substrings are still given relative to the start of +string itself. (Before PCRE2 release 10.30 these were given relative to +string + pmatch[0].rm_so, but this differs from other +implementations.) +
++This is a BSD extension, compatible with but not specified by IEEE Standard +1003.2 (POSIX.2), and should be used with caution in software intended to be +portable to other systems. Note that a non-zero rm_so does not imply +REG_NOTBOL; REG_STARTEND affects only the location and length of the string, +not how it is matched. Setting REG_STARTEND and passing pmatch as NULL +are mutually exclusive; the error REG_INVARG is returned. +
++If the pattern was compiled with the REG_NOSUB flag, no data about any matched +strings is returned. The nmatch and pmatch arguments of +pcre2_regexec() are ignored (except possibly as input for REG_STARTEND). +
++The value of nmatch may be zero, and the value of pmatch may be NULL +(unless REG_STARTEND is set); in both these cases no data about any matched +strings is returned. +
++Otherwise, the portion of the string that was matched, and also any captured +substrings, are returned via the pmatch argument, which points to an +array of nmatch structures of type regmatch_t, containing the +members rm_so and rm_eo. These contain the byte offset to the first +character of each substring and the offset to the first character after the end +of each substring, respectively. The 0th element of the vector relates to the +entire portion of string that was matched; subsequent elements relate to +the capturing subpatterns of the regular expression. Unused entries in the +array have both structure members set to -1. +
++regmatch_t as well as the regoff_t typedef it uses are defined in +pcre2posix.h and are not warranted to have the same size or layout as other +similarly named types from other libraries that provide POSIX-style matching. +
++A successful match yields a zero return; various error codes are defined in the +header file, of which REG_NOMATCH is the "expected" failure code. +
++The pcre2_regerror() function maps a non-zero error code from either +pcre2_regcomp() or pcre2_regexec() to a printable message. If +preg is not NULL, the error should have arisen from the use of that +structure. A message terminated by a binary zero is placed in errbuf. If +the buffer is too short, only the first errbuf_size - 1 characters of the +error message are used. The yield of the function is the size of buffer needed +to hold the whole message, including the terminating zero. This value is +greater than errbuf_size if the message was truncated. +
++Compiling a regular expression causes memory to be allocated and associated +with the preg structure. The function pcre2_regfree() frees all +such memory, after which preg may no longer be used as a compiled +expression. +
+
+Philip Hazel
+
+Retired from University Computing Service
+
+Cambridge, England.
+
+
+Last updated: 19 January 2024
+
+Copyright © 1997-2024 University of Cambridge.
+
+
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2sample.html b/doc/html/pcre2sample.html new file mode 100644 index 0000000..345df03 --- /dev/null +++ b/doc/html/pcre2sample.html @@ -0,0 +1,110 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+PCRE2 SAMPLE PROGRAM
+
+
+A simple, complete demonstration program to get you started with using PCRE2 is +supplied in the file pcre2demo.c in the src directory in the PCRE2 +distribution. A listing of this program is given in the +pcre2demo +documentation. If you do not have a copy of the PCRE2 distribution, you can +save this listing to re-create the contents of pcre2demo.c. +
++The demonstration program compiles the regular expression that is its +first argument, and matches it against the subject string in its second +argument. No PCRE2 options are set, and default character tables are used. If +matching succeeds, the program outputs the portion of the subject that matched, +together with the contents of any captured substrings. +
++If the -g option is given on the command line, the program then goes on to +check for further matches of the same regular expression in the same subject +string. The logic is a little bit tricky because of the possibility of matching +an empty string. Comments in the code explain what is going on. +
++The code in pcre2demo.c is an 8-bit program that uses the PCRE2 8-bit +library. It handles strings and characters that are stored in 8-bit code units. +By default, one character corresponds to one code unit, but if the pattern +starts with "(*UTF)", both it and the subject are treated as UTF-8 strings, +where characters may occupy multiple code units. +
++If PCRE2 is installed in the standard include and library directories for your +operating system, you should be able to compile the demonstration program using +a command like this: +
+ cc -o pcre2demo pcre2demo.c -lpcre2-8 ++If PCRE2 is installed elsewhere, you may need to add additional options to the +command line. For example, on a Unix-like system that has PCRE2 installed in +/usr/local, you can compile the demonstration program using a command +like this: +
+ cc -o pcre2demo -I/usr/local/include pcre2demo.c -L/usr/local/lib -lpcre2-8 ++Once you have built the demonstration program, you can run simple tests like +this: +
+ ./pcre2demo 'cat|dog' 'the cat sat on the mat' + ./pcre2demo -g 'cat|dog' 'the dog sat on the cat' ++Note that there is a much more comprehensive test program, called +pcre2test, +which supports many more facilities for testing regular expressions using all +three PCRE2 libraries (8-bit, 16-bit, and 32-bit, though not all three need be +installed). The +pcre2demo +program is provided as a relatively simple coding example. + +
+If you try to run +pcre2demo +when PCRE2 is not installed in the standard library directory, you may get an +error like this on some operating systems (e.g. Solaris): +
+ ld.so.1: pcre2demo: fatal: libpcre2-8.so.0: open failed: No such file or directory ++This is caused by the way shared library support works on those systems. You +need to add +
+ -R/usr/local/lib ++(for example) to the compile command to get round this problem. + +
+Philip Hazel
+
+Retired from University Computing Service
+
+Cambridge, England.
+
+
+Last updated: 02 February 2016
+
+Copyright © 1997-2016 University of Cambridge.
+
+
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2serialize.html b/doc/html/pcre2serialize.html new file mode 100644 index 0000000..19418a8 --- /dev/null +++ b/doc/html/pcre2serialize.html @@ -0,0 +1,212 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+int32_t pcre2_serialize_decode(pcre2_code **codes,
+ int32_t number_of_codes, const uint8_t *bytes,
+ pcre2_general_context *gcontext);
+
+
+int32_t pcre2_serialize_encode(const pcre2_code **codes,
+ int32_t number_of_codes, uint8_t **serialized_bytes,
+ PCRE2_SIZE *serialized_size, pcre2_general_context *gcontext);
+
+
+void pcre2_serialize_free(uint8_t *bytes);
+
+
+int32_t pcre2_serialize_get_number_of_codes(const uint8_t *bytes);
+
+
+If you are running an application that uses a large number of regular
+expression patterns, it may be useful to store them in a precompiled form
+instead of having to compile them every time the application is run. However,
+if you are using the just-in-time optimization feature, it is not possible to
+save and reload the JIT data, because it is position-dependent. The host on
+which the patterns are reloaded must be running the same version of PCRE2, with
+the same code unit width, and must also have the same endianness, pointer width
+and PCRE2_SIZE type. For example, patterns compiled on a 32-bit system using
+PCRE2's 16-bit library cannot be reloaded on a 64-bit system, nor can they be
+reloaded using the 8-bit library.
+
+Note that "serialization" in PCRE2 does not convert compiled patterns to an +abstract format like Java or .NET serialization. The serialized output is +really just a bytecode dump, which is why it can only be reloaded in the same +environment as the one that created it. Hence the restrictions mentioned above. +Applications that are not statically linked with a fixed version of PCRE2 must +be prepared to recompile patterns from their sources, in order to be immune to +PCRE2 upgrades. +
++The facility for saving and restoring compiled patterns is intended for use +within individual applications. As such, the data supplied to +pcre2_serialize_decode() is expected to be trusted data, not data from +arbitrary external sources. There is only some simple consistency checking, not +complete validation of what is being re-loaded. Corrupted data may cause +undefined results. For example, if the length field of a pattern in the +serialized data is corrupted, the deserializing code may read beyond the end of +the byte stream that is passed to it. +
++Before compiled patterns can be saved they must be serialized, which in PCRE2 +means converting the pattern to a stream of bytes. A single byte stream may +contain any number of compiled patterns, but they must all use the same +character tables. A single copy of the tables is included in the byte stream +(its size is 1088 bytes). For more details of character tables, see the +section on locale support +in the +pcre2api +documentation. +
++The function pcre2_serialize_encode() creates a serialized byte stream +from a list of compiled patterns. Its first two arguments specify the list, +being a pointer to a vector of pointers to compiled patterns, and the length of +the vector. The third and fourth arguments point to variables which are set to +point to the created byte stream and its length, respectively. The final +argument is a pointer to a general context, which can be used to specify custom +memory management functions. If this argument is NULL, malloc() is used +to obtain memory for the byte stream. The yield of the function is the number +of serialized patterns, or one of the following negative error codes: +
+ PCRE2_ERROR_BADDATA the number of patterns is zero or less + PCRE2_ERROR_BADMAGIC mismatch of id bytes in one of the patterns + PCRE2_ERROR_NOMEMORY memory allocation failed + PCRE2_ERROR_MIXEDTABLES the patterns do not all use the same tables + PCRE2_ERROR_NULL the 1st, 3rd, or 4th argument is NULL ++PCRE2_ERROR_BADMAGIC means either that a pattern's code has been corrupted, or +that a slot in the vector does not point to a compiled pattern. + +
+Once a set of patterns has been serialized you can save the data in any +appropriate manner. Here is sample code that compiles two patterns and writes +them to a file. It assumes that the variable fd refers to a file that is +open for output. The error checking that should be present in a real +application has been omitted for simplicity. +
+ int errorcode;
+ uint8_t *bytes;
+ PCRE2_SIZE erroroffset;
+ PCRE2_SIZE bytescount;
+ pcre2_code *list_of_codes[2];
+ list_of_codes[0] = pcre2_compile("first pattern",
+ PCRE2_ZERO_TERMINATED, 0, &errorcode, &erroroffset, NULL);
+ list_of_codes[1] = pcre2_compile("second pattern",
+ PCRE2_ZERO_TERMINATED, 0, &errorcode, &erroroffset, NULL);
+ errorcode = pcre2_serialize_encode(list_of_codes, 2, &bytes,
+ &bytescount, NULL);
+ errorcode = fwrite(bytes, 1, bytescount, fd);
+
+Note that the serialized data is binary data that may contain any of the 256
+possible byte values. On systems that make a distinction between binary and
+non-binary data, be sure that the file is opened for binary output.
+
++Serializing a set of patterns leaves the original data untouched, so they can +still be used for matching. Their memory must eventually be freed in the usual +way by calling pcre2_code_free(). When you have finished with the byte +stream, it too must be freed by calling pcre2_serialize_free(). If this +function is called with a NULL argument, it returns immediately without doing +anything. +
++In order to re-use a set of saved patterns you must first make the serialized +byte stream available in main memory (for example, by reading from a file). The +management of this memory block is up to the application. You can use the +pcre2_serialize_get_number_of_codes() function to find out how many +compiled patterns are in the serialized data without actually decoding the +patterns: +
+ uint8_t *bytes = <serialized data>; + int32_t number_of_codes = pcre2_serialize_get_number_of_codes(bytes); ++The pcre2_serialize_decode() function reads a byte stream and recreates +the compiled patterns in new memory blocks, setting pointers to them in a +vector. The first two arguments are a pointer to a suitable vector and its +length, and the third argument points to a byte stream. The final argument is a +pointer to a general context, which can be used to specify custom memory +management functions for the decoded patterns. If this argument is NULL, +malloc() and free() are used. After deserialization, the byte +stream is no longer needed and can be discarded. +
+ pcre2_code *list_of_codes[2]; + uint8_t *bytes = <serialized data>; + int32_t number_of_codes = + pcre2_serialize_decode(list_of_codes, 2, bytes, NULL); ++If the vector is not large enough for all the patterns in the byte stream, it +is filled with those that fit, and the remainder are ignored. The yield of the +function is the number of decoded patterns, or one of the following negative +error codes: +
+ PCRE2_ERROR_BADDATA second argument is zero or less + PCRE2_ERROR_BADMAGIC mismatch of id bytes in the data + PCRE2_ERROR_BADMODE mismatch of code unit size or PCRE2 version + PCRE2_ERROR_BADSERIALIZEDDATA other sanity check failure + PCRE2_ERROR_NOMEMORY memory allocation failed + PCRE2_ERROR_NULL first or third argument is NULL ++PCRE2_ERROR_BADMAGIC may mean that the data is corrupt, or that it was compiled +on a system with different endianness. + +
+Decoded patterns can be used for matching in the usual way, and must be freed +by calling pcre2_code_free(). However, be aware that there is a potential +race issue if you are using multiple patterns that were decoded from a single +byte stream in a multithreaded application. A single copy of the character +tables is used by all the decoded patterns and a reference count is used to +arrange for its memory to be automatically freed when the last pattern is +freed, but there is no locking on this reference count. Therefore, if you want +to call pcre2_code_free() for these patterns in different threads, you +must arrange your own locking, and ensure that pcre2_code_free() cannot +be called by two threads at the same time. +
++If a pattern was processed by pcre2_jit_compile() before being +serialized, the JIT data is discarded and so is no longer available after a +save/restore cycle. You can, however, process a restored pattern with +pcre2_jit_compile() if you wish. +
+
+Philip Hazel
+
+Retired from University Computing Service
+
+Cambridge, England.
+
+
+Last updated: 27 June 2018
+
+Copyright © 1997-2018 University of Cambridge.
+
+
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2syntax.html b/doc/html/pcre2syntax.html new file mode 100644 index 0000000..1c0ccb0 --- /dev/null +++ b/doc/html/pcre2syntax.html @@ -0,0 +1,635 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+The full syntax and semantics of the regular expressions that are supported by +PCRE2 are described in the +pcre2pattern +documentation. This document contains a quick-reference summary of the syntax. +
++
+ \x where x is non-alphanumeric is a literal x + \Q...\E treat enclosed characters as literal ++Note that white space inside \Q...\E is always treated as literal, even if +PCRE2_EXTENDED is set, causing most other white space to be ignored. + +
+With one exception, wherever brace characters { and } are required to enclose +data for constructions such as \g{2} or \k{name}, space and/or horizontal tab +characters that follow { or precede } are allowed and are ignored. In the case +of quantifiers, they may also appear before or after the comma. The exception +is \u{...} which is not Perl-compatible and is recognized only when +PCRE2_EXTRA_ALT_BSUX is set. This is an ECMAScript compatibility feature, and +follows ECMAScript's behaviour. +
++This table applies to ASCII and Unicode environments. An unrecognized escape +sequence causes an error. +
+ \a alarm, that is, the BEL character (hex 07)
+ \cx "control-x", where x is a non-control ASCII character
+ \e escape (hex 1B)
+ \f form feed (hex 0C)
+ \n newline (hex 0A)
+ \r carriage return (hex 0D)
+ \t tab (hex 09)
+ \0dd character with octal code 0dd
+ \ddd character with octal code ddd, or backreference
+ \o{ddd..} character with octal code ddd..
+ \N{U+hh..} character with Unicode code point hh.. (Unicode mode only)
+ \xhh character with hex code hh
+ \x{hh..} character with hex code hh..
+
+If PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set ("ALT_BSUX mode"), the
+following are also recognized:
+
+ \U the character "U"
+ \uhhhh character with hex code hhhh
+ \u{hh..} character with hex code hh.. but only for EXTRA_ALT_BSUX
+
+When \x is not followed by {, from zero to two hexadecimal digits are read,
+but in ALT_BSUX mode \x must be followed by two hexadecimal digits to be
+recognized as a hexadecimal escape; otherwise it matches a literal "x".
+Likewise, if \u (in ALT_BSUX mode) is not followed by four hexadecimal digits
+or (in EXTRA_ALT_BSUX mode) a sequence of hex digits in curly brackets, it
+matches a literal "u".
+
++Note that \0dd is always an octal code. The treatment of backslash followed by +a non-zero digit is complicated; for details see the section +"Non-printing characters" +in the +pcre2pattern +documentation, where details of escape processing in EBCDIC environments are +also given. \N{U+hh..} is synonymous with \x{hh..} in PCRE2 but is not +supported in EBCDIC environments. Note that \N not followed by an opening +curly bracket has a different meaning (see below). +
++
+ . any character except newline;
+ in dotall mode, any character whatsoever
+ \C one code unit, even in UTF mode (best avoided)
+ \d a decimal digit
+ \D a character that is not a decimal digit
+ \h a horizontal white space character
+ \H a character that is not a horizontal white space character
+ \N a character that is not a newline
+ \p{xx} a character with the xx property
+ \P{xx} a character without the xx property
+ \R a newline sequence
+ \s a white space character
+ \S a character that is not a white space character
+ \v a vertical white space character
+ \V a character that is not a vertical white space character
+ \w a "word" character
+ \W a "non-word" character
+ \X a Unicode extended grapheme cluster
+
+\C is dangerous because it may leave the current matching point in the middle
+of a UTF-8 or UTF-16 character. The application can lock out the use of \C by
+setting the PCRE2_NEVER_BACKSLASH_C option. It is also possible to build PCRE2
+with the use of \C permanently disabled.
+
++By default, \d, \s, and \w match only ASCII characters, even in UTF-8 mode +or in the 16-bit and 32-bit libraries. However, if locale-specific matching is +happening, \s and \w may also match characters with code points in the range +128-255. If the PCRE2_UCP option is set, the behaviour of these escape +sequences is changed to use Unicode properties and they match many more +characters, but there are some option settings that can restrict individual +sequences to matching only ASCII characters. +
++Property descriptions in \p and \P are matched caselessly; hyphens, +underscores, and white space are ignored, in accordance with Unicode's "loose +matching" rules. +
++
+ C Other + Cc Control + Cf Format + Cn Unassigned + Co Private use + Cs Surrogate + + L Letter + Ll Lower case letter + Lm Modifier letter + Lo Other letter + Lt Title case letter + Lu Upper case letter + Lc Ll, Lu, or Lt + L& Ll, Lu, or Lt + + M Mark + Mc Spacing mark + Me Enclosing mark + Mn Non-spacing mark + + N Number + Nd Decimal number + Nl Letter number + No Other number + + P Punctuation + Pc Connector punctuation + Pd Dash punctuation + Pe Close punctuation + Pf Final punctuation + Pi Initial punctuation + Po Other punctuation + Ps Open punctuation + + S Symbol + Sc Currency symbol + Sk Modifier symbol + Sm Mathematical symbol + So Other symbol + + Z Separator + Zl Line separator + Zp Paragraph separator + Zs Space separator ++ +
+
+ Xan Alphanumeric: union of properties L and N + Xps POSIX space: property Z or tab, NL, VT, FF, CR + Xsp Perl space: property Z or tab, NL, VT, FF, CR + Xuc Universally-named character: one that can be + represented by a Universal Character Name + Xwd Perl word: property Xan or underscore ++Perl and POSIX space are now the same. Perl added VT to its space character set +at release 5.18. + +
+Unicode defines a number of binary properties, that is, properties whose only +values are true or false. You can obtain a list of those that are recognized by +\p and \P, along with their abbreviations, by running this command: +
+ pcre2test -LP ++ +
+Many script names and their 4-letter abbreviations are recognized in +\p{sc:...} or \p{scx:...} items, or on their own with \p (and also \P of +course). You can obtain a list of these scripts by running this command: +
+ pcre2test -LS ++ +
+
+ \p{Bidi_Class:<class>} matches a character with the given class
+ \p{BC:<class>} matches a character with the given class
+
+The recognized classes are:
+ AL Arabic letter + AN Arabic number + B paragraph separator + BN boundary neutral + CS common separator + EN European number + ES European separator + ET European terminator + FSI first strong isolate + L left-to-right + LRE left-to-right embedding + LRI left-to-right isolate + LRO left-to-right override + NSM non-spacing mark + ON other neutral + PDF pop directional format + PDI pop directional isolate + R right-to-left + RLE right-to-left embedding + RLI right-to-left isolate + RLO right-to-left override + S segment separator + WS white space ++ +
+
+ [...] positive character class + [^...] negative character class + [x-y] range (can be used for hex characters) + [[:xxx:]] positive POSIX named set + [[:^xxx:]] negative POSIX named set + + alnum alphanumeric + alpha alphabetic + ascii 0-127 + blank space or tab + cntrl control character + digit decimal digit + graph printing, excluding space + lower lower case letter + print printing, including space + punct printing, excluding alphanumeric + space white space + upper upper case letter + word same as \w + xdigit hexadecimal digit ++In PCRE2, POSIX character set names recognize only ASCII characters by default, +but some of them use Unicode properties if PCRE2_UCP is set. You can use +\Q...\E inside a character class. + +
+
+ ? 0 or 1, greedy
+ ?+ 0 or 1, possessive
+ ?? 0 or 1, lazy
+ * 0 or more, greedy
+ *+ 0 or more, possessive
+ *? 0 or more, lazy
+ + 1 or more, greedy
+ ++ 1 or more, possessive
+ +? 1 or more, lazy
+ {n} exactly n
+ {n,m} at least n, no more than m, greedy
+ {n,m}+ at least n, no more than m, possessive
+ {n,m}? at least n, no more than m, lazy
+ {n,} n or more, greedy
+ {n,}+ n or more, possessive
+ {n,}? n or more, lazy
+ {,m} zero up to m, greedy
+ {,m}+ zero up to m, possessive
+ {,m}? zero up to m, lazy
+
+
++
+ \b word boundary + \B not a word boundary + ^ start of subject + also after an internal newline in multiline mode + (after any newline if PCRE2_ALT_CIRCUMFLEX is set) + \A start of subject + $ end of subject + also before newline at end of subject + also before internal newline in multiline mode + \Z end of subject + also before newline at end of subject + \z end of subject + \G first matching position in subject ++ +
+
+ \K set reported start of match ++From release 10.38 \K is not permitted by default in lookaround assertions, +for compatibility with Perl. However, if the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK +option is set, the previous behaviour is re-enabled. When this option is set, +\K is honoured in positive assertions, but ignored in negative ones. + +
+
+ expr|expr|expr... ++ +
+
+ (...) capture group + (?<name>...) named capture group (Perl) + (?'name'...) named capture group (Perl) + (?P<name>...) named capture group (Python) + (?:...) non-capture group + (?|...) non-capture group; reset group numbers for + capture groups in each alternative ++In non-UTF modes, names may contain underscores and ASCII letters and digits; +in UTF modes, any Unicode letters and Unicode decimal digits are permitted. In +both cases, a name must not start with a digit. + +
+
+ (?>...) atomic non-capture group + (*atomic:...) atomic non-capture group ++ +
+
+ (?#....) comment (not nestable) ++ +
+Changes of these options within a group are automatically cancelled at the end +of the group. +
+ (?a) all ASCII options + (?aD) restrict \d to ASCII in UCP mode + (?aS) restrict \s to ASCII in UCP mode + (?aW) restrict \w to ASCII in UCP mode + (?aP) restrict all POSIX classes to ASCII in UCP mode + (?aT) restrict POSIX digit classes to ASCII in UCP mode + (?i) caseless + (?J) allow duplicate named groups + (?m) multiline + (?n) no auto capture + (?r) restrict caseless to either ASCII or non-ASCII + (?s) single line (dotall) + (?U) default ungreedy (lazy) + (?x) ignore white space except in classes or \Q...\E + (?xx) as (?x) but also ignore space and tab in classes + (?-...) unset the given option(s) + (?^) unset imnrsx options ++(?aP) implies (?aT) as well, though this has no additional effect. However, it +means that (?-aP) is really (?-PT) which disables all ASCII restrictions for +POSIX classes. + +
+Unsetting x or xx unsets both. Several options may be set at once, and a +mixture of setting and unsetting such as (?i-x) is allowed, but there may be +only one hyphen. Setting (but no unsetting) is allowed after (?^ for example +(?^in). An option setting may appear at the start of a non-capture group, for +example (?i:...). +
++The following are recognized only at the very start of a pattern or after one +of the newline or \R options with similar syntax. More than one of them may +appear. For the first three, d is a decimal number. +
+ (*LIMIT_DEPTH=d) set the backtracking limit to d + (*LIMIT_HEAP=d) set the heap size limit to d * 1024 bytes + (*LIMIT_MATCH=d) set the match limit to d + (*NOTEMPTY) set PCRE2_NOTEMPTY when matching + (*NOTEMPTY_ATSTART) set PCRE2_NOTEMPTY_ATSTART when matching + (*NO_AUTO_POSSESS) no auto-possessification (PCRE2_NO_AUTO_POSSESS) + (*NO_DOTSTAR_ANCHOR) no .* anchoring (PCRE2_NO_DOTSTAR_ANCHOR) + (*NO_JIT) disable JIT optimization + (*NO_START_OPT) no start-match optimization (PCRE2_NO_START_OPTIMIZE) + (*UTF) set appropriate UTF mode for the library in use + (*UCP) set PCRE2_UCP (use Unicode properties for \d etc) ++Note that LIMIT_DEPTH, LIMIT_HEAP, and LIMIT_MATCH can only reduce the value of +the limits set by the caller of pcre2_match() or pcre2_dfa_match(), +not increase them. LIMIT_RECURSION is an obsolete synonym for LIMIT_DEPTH. The +application can lock out the use of (*UTF) and (*UCP) by setting the +PCRE2_NEVER_UTF or PCRE2_NEVER_UCP options, respectively, at compile time. + +
+These are recognized only at the very start of the pattern or after option +settings with a similar syntax. +
+ (*CR) carriage return only + (*LF) linefeed only + (*CRLF) carriage return followed by linefeed + (*ANYCRLF) all three of the above + (*ANY) any Unicode newline sequence + (*NUL) the NUL character (binary zero) ++ +
+These are recognized only at the very start of the pattern or after option +settings with a similar syntax. +
+ (*BSR_ANYCRLF) CR, LF, or CRLF + (*BSR_UNICODE) any Unicode newline sequence ++ +
+
+ (?=...) ) + (*pla:...) ) positive lookahead + (*positive_lookahead:...) ) + + (?!...) ) + (*nla:...) ) negative lookahead + (*negative_lookahead:...) ) + + (?<=...) ) + (*plb:...) ) positive lookbehind + (*positive_lookbehind:...) ) + + (?<!...) ) + (*nlb:...) ) negative lookbehind + (*negative_lookbehind:...) ) ++Each top-level branch of a lookbehind must have a limit for the number of +characters it matches. If any branch can match a variable number of characters, +the maximum for each branch is limited to a value set by the caller of +pcre2_compile() or defaulted. The default is set when PCRE2 is built +(ultimate default 255). If every branch matches a fixed number of characters, +the limit for each branch is 65535 characters. + +
+These assertions are specific to PCRE2 and are not Perl-compatible. +
+ (?*...) ) + (*napla:...) ) synonyms + (*non_atomic_positive_lookahead:...) ) + + (?<*...) ) + (*naplb:...) ) synonyms + (*non_atomic_positive_lookbehind:...) ) ++ +
+
+ (*script_run:...) ) script run, can be backtracked into + (*sr:...) ) + + (*atomic_script_run:...) ) atomic script run + (*asr:...) ) ++ +
+
+ \n reference by number (can be ambiguous)
+ \gn reference by number
+ \g{n} reference by number
+ \g+n relative reference by number (PCRE2 extension)
+ \g-n relative reference by number
+ \g{+n} relative reference by number (PCRE2 extension)
+ \g{-n} relative reference by number
+ \k<name> reference by name (Perl)
+ \k'name' reference by name (Perl)
+ \g{name} reference by name (Perl)
+ \k{name} reference by name (.NET)
+ (?P=name) reference by name (Python)
+
+
++
+ (?R) recurse whole pattern + (?n) call subroutine by absolute number + (?+n) call subroutine by relative number + (?-n) call subroutine by relative number + (?&name) call subroutine by name (Perl) + (?P>name) call subroutine by name (Python) + \g<name> call subroutine by name (Oniguruma) + \g'name' call subroutine by name (Oniguruma) + \g<n> call subroutine by absolute number (Oniguruma) + \g'n' call subroutine by absolute number (Oniguruma) + \g<+n> call subroutine by relative number (PCRE2 extension) + \g'+n' call subroutine by relative number (PCRE2 extension) + \g<-n> call subroutine by relative number (PCRE2 extension) + \g'-n' call subroutine by relative number (PCRE2 extension) ++ +
+
+ (?(condition)yes-pattern)
+ (?(condition)yes-pattern|no-pattern)
+
+ (?(n) absolute reference condition
+ (?(+n) relative reference condition (PCRE2 extension)
+ (?(-n) relative reference condition (PCRE2 extension)
+ (?(<name>) named reference condition (Perl)
+ (?('name') named reference condition (Perl)
+ (?(name) named reference condition (PCRE2, deprecated)
+ (?(R) overall recursion condition
+ (?(Rn) specific numbered group recursion condition
+ (?(R&name) specific named group recursion condition
+ (?(DEFINE) define groups for reference
+ (?(VERSION[>]=n.m) test PCRE2 version
+ (?(assert) assertion condition
+
+Note the ambiguity of (?(R) and (?(Rn) which might be named reference
+conditions or recursion tests. Such a condition is interpreted as a reference
+condition if the relevant named group exists.
+
++All backtracking control verbs may be in the form (*VERB:NAME). For (*MARK) the +name is mandatory, for the others it is optional. (*SKIP) changes its behaviour +if :NAME is present. The others just set a name for passing back to the caller, +but this is not a name that (*SKIP) can see. The following act immediately they +are reached: +
+ (*ACCEPT) force successful match + (*FAIL) force backtrack; synonym (*F) + (*MARK:NAME) set name to be passed back; synonym (*:NAME) ++The following act only when a subsequent match failure causes a backtrack to +reach them. They all force a match failure, but they differ in what happens +afterwards. Those that advance the start-of-match point do so only if the +pattern is not anchored. +
+ (*COMMIT) overall failure, no advance of starting point + (*PRUNE) advance to next starting character + (*SKIP) advance to current matching position + (*SKIP:NAME) advance to position corresponding to an earlier + (*MARK:NAME); if not found, the (*SKIP) is ignored + (*THEN) local failure, backtrack to next alternation ++The effect of one of these verbs in a group called as a subroutine is confined +to the subroutine call. + +
+
+ (?C) callout (assumed number 0) + (?Cn) callout with numerical data n + (?C"text") callout with string data ++The allowed string delimiters are ` ' " ^ % # $ (which are the same for the +start and the end), and the starting delimiter { matched with the ending +delimiter }. To encode the ending delimiter within the string, double it. + +
+pcre2pattern(3), pcre2api(3), pcre2callout(3), +pcre2matching(3), pcre2(3). +
+
+Philip Hazel
+
+Retired from University Computing Service
+
+Cambridge, England.
+
+
+Last updated: 12 October 2023
+
+Copyright © 1997-2023 University of Cambridge.
+
+
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2test.html b/doc/html/pcre2test.html new file mode 100644 index 0000000..6cc3cc3 --- /dev/null +++ b/doc/html/pcre2test.html @@ -0,0 +1,2213 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+pcre2test [options] [input file [output file]]
+
+
+pcre2test is a test program for the PCRE2 regular expression libraries,
+but it can also be used for experimenting with regular expressions. This
+document describes the features of the test program; for details of the regular
+expressions themselves, see the
+pcre2pattern
+documentation. For details of the PCRE2 library function calls and their
+options, see the
+pcre2api
+documentation.
+
+The input for pcre2test is a sequence of regular expression patterns and +subject strings to be matched. There are also command lines for setting +defaults and controlling some special actions. The output shows the result of +each match attempt. Modifiers on external or internal command lines, the +patterns, and the subject lines specify PCRE2 function options, control how the +subject is processed, and what output is produced. +
++There are many obscure modifiers, some of which are specifically designed for +use in conjunction with the test script and data files that are distributed as +part of PCRE2. All the modifiers are documented here, some without much +justification, but many of them are unlikely to be of use except when testing +the libraries. +
++Different versions of the PCRE2 library can be built to support character +strings that are encoded in 8-bit, 16-bit, or 32-bit code units. One, two, or +all three of these libraries may be simultaneously installed. The +pcre2test program can be used to test all the libraries. However, its own +input and output are always in 8-bit format. When testing the 16-bit or 32-bit +libraries, patterns and subject strings are converted to 16-bit or 32-bit +format before being passed to the library functions. Results are converted back +to 8-bit code units for output. +
++In the rest of this document, the names of library functions and structures +are given in generic form, for example, pcre2_compile(). The actual +names used in the libraries have a suffix _8, _16, or _32, as appropriate. +
++Input to pcre2test is processed line by line, either by calling the C +library's fgets() function, or via the libreadline or libedit +library. In some Windows environments character 26 (hex 1A) causes an immediate +end of file, and no further data is read, so this character should be avoided +unless you really want that action. +
++The input is processed using C's string functions, so must not contain binary +zeros, even though in Unix-like environments, fgets() treats any bytes +other than newline as data characters. An error is generated if a binary zero +is encountered. By default subject lines are processed for backslash escapes, +which makes it possible to include any data value in strings that are passed to +the library for matching. For patterns, there is a facility for specifying some +or all of the 8-bit input characters as hexadecimal pairs, which makes it +possible to include binary zeros. +
++When testing the 16-bit or 32-bit libraries, there is a need to be able to +generate character code points greater than 255 in the strings that are passed +to the library. For subject lines, backslash escapes can be used. In addition, +when the utf modifier (see +"Setting compilation options" +below) is set, the pattern and any following subject lines are interpreted as +UTF-8 strings and translated to UTF-16 or UTF-32 as appropriate. +
++For non-UTF testing of wide characters, the utf8_input modifier can be +used. This is mutually exclusive with utf, and is allowed only in 16-bit +or 32-bit mode. It causes the pattern and following subject lines to be treated +as UTF-8 according to the original definition (RFC 2279), which allows for +character values up to 0x7fffffff. Each character is placed in one 16-bit or +32-bit code unit (in the 16-bit case, values greater than 0xffff cause an error +to occur). +
++UTF-8 (in its original definition) is not capable of encoding values greater +than 0x7fffffff, but such values can be handled by the 32-bit library. When +testing this library in non-UTF mode with utf8_input set, if any +character is preceded by the byte 0xff (which is an invalid byte in UTF-8) +0x80000000 is added to the character's value. This is the only way of passing +such code points in a pattern string. For subject strings, using an escape +sequence is preferable. +
++-8 +If the 8-bit library has been built, this option causes it to be used (this is +the default). If the 8-bit library has not been built, this option causes an +error. +
++-16 +If the 16-bit library has been built, this option causes it to be used. If the +8-bit library has not been built, this is the default. If the 16-bit library +has not been built, this option causes an error. +
++-32 +If the 32-bit library has been built, this option causes it to be used. If no +other library has been built, this is the default. If the 32-bit library has +not been built, this option causes an error. +
++-ac +Behave as if each pattern has the auto_callout modifier, that is, insert +automatic callouts into every pattern that is compiled. +
++-AC +As for -ac, but in addition behave as if each subject line has the +callout_extra modifier, that is, show additional information from +callouts. +
++-b +Behave as if each pattern has the fullbincode modifier; the full +internal binary form of the pattern is output after compilation. +
+-C +Output the version number of the PCRE2 library, and all available information +about the optional features that are included, and then exit with zero exit +code. All other options are ignored. If both -C and any -Lx options are present, +whichever is first is recognized. +
++-C option +Output information about a specific build-time option, then exit. This +functionality is intended for use in scripts such as RunTest. The +following options output the value and set the exit code as indicated: +
+ ebcdic-nl the code for LF (= NL) in an EBCDIC environment: + 0x15 or 0x25 + 0 if used in an ASCII environment + exit code is always 0 + linksize the configured internal link size (2, 3, or 4) + exit code is set to the link size + newline the default newline setting: + CR, LF, CRLF, ANYCRLF, ANY, or NUL + exit code is always 0 + bsr the default setting for what \R matches: + ANYCRLF or ANY + exit code is always 0 ++The following options output 1 for true or 0 for false, and set the exit code +to the same value: +
+ backslash-C \C is supported (not locked out) + ebcdic compiled for an EBCDIC environment + jit just-in-time support is available + pcre2-16 the 16-bit library was built + pcre2-32 the 32-bit library was built + pcre2-8 the 8-bit library was built + unicode Unicode support is available ++If an unknown option is given, an error message is output; the exit code is 0. + +
+-d +Behave as if each pattern has the debug modifier; the internal +form and information about the compiled pattern is output after compilation; +-d is equivalent to -b -i. +
++-dfa +Behave as if each subject line has the dfa modifier; matching is done +using the pcre2_dfa_match() function instead of the default +pcre2_match(). +
++-error number[,number,...] +Call pcre2_get_error_message() for each of the error numbers in the +comma-separated list, display the resulting messages on the standard output, +then exit with zero exit code. The numbers may be positive or negative. This is +a convenience facility for PCRE2 maintainers. +
++-help +Output a brief summary of these options and then exit. +
++-i +Behave as if each pattern has the info modifier; information about the +compiled pattern is given after compilation. +
++-jit +Behave as if each pattern line has the jit modifier; after successful +compilation, each pattern is passed to the just-in-time compiler, if available. +
++-jitfast +Behave as if each pattern line has the jitfast modifier; after +successful compilation, each pattern is passed to the just-in-time compiler, if +available, and each subject line is passed directly to the JIT matcher via its +"fast path". +
++-jitverify +Behave as if each pattern line has the jitverify modifier; after +successful compilation, each pattern is passed to the just-in-time compiler, if +available, and the use of JIT for matching is verified. +
++-LM +List modifiers: write a list of available pattern and subject modifiers to the +standard output, then exit with zero exit code. All other options are ignored. +If both -C and any -Lx options are present, whichever is first is recognized. +
++-LP +List properties: write a list of recognized Unicode properties to the standard +output, then exit with zero exit code. All other options are ignored. If both +-C and any -Lx options are present, whichever is first is recognized. +
++-LS +List scripts: write a list of recognized Unicode script names to the standard +output, then exit with zero exit code. All other options are ignored. If both +-C and any -Lx options are present, whichever is first is recognized. +
++-pattern modifier-list +Behave as if each pattern line contains the given modifiers. +
++-q +Do not output the version number of pcre2test at the start of execution. +
++-S size +On Unix-like systems, set the size of the run-time stack to size +mebibytes (units of 1024*1024 bytes). +
++-subject modifier-list +Behave as if each subject line contains the given modifiers. +
++-t +Run each compile and match many times with a timer, and output the resulting +times per compile or match. When JIT is used, separate times are given for the +initial compile and the JIT compile. You can control the number of iterations +that are used for timing by following -t with a number (as a separate +item on the command line). For example, "-t 1000" iterates 1000 times. The +default is to iterate 500,000 times. +
++-tm +This is like -t except that it times only the matching phase, not the +compile phase. +
++-T -TM +These behave like -t and -tm, but in addition, at the end of a run, +the total times for all compiles and matches are output. +
++-version +Output the PCRE2 version number and then exit. +
++If pcre2test is given two filename arguments, it reads from the first and +writes to the second. If the first name is "-", input is taken from the +standard input. If pcre2test is given only one argument, it reads from +that file and writes to stdout. Otherwise, it reads from stdin and writes to +stdout. +
++When pcre2test is built, a configuration option can specify that it +should be linked with the libreadline or libedit library. When this +is done, if the input is from a terminal, it is read using the readline() +function. This provides line-editing and history facilities. The output from +the -help option states whether or not readline() will be used. +
++The program handles any number of tests, each of which consists of a set of +input lines. Each set starts with a regular expression pattern, followed by any +number of subject lines to be matched against that pattern. In between sets of +test data, command lines that begin with # may appear. This file format, with +some restrictions, can also be processed by the perltest.sh script that +is distributed with PCRE2 as a means of checking that the behaviour of PCRE2 +and Perl is the same. For a specification of perltest.sh, see the +comments near its beginning. See also the #perltest command below. +
++When the input is a terminal, pcre2test prompts for each line of input, +using "re>" to prompt for regular expression patterns, and "data>" to prompt +for subject lines. Command lines starting with # can be entered only in +response to the "re>" prompt. +
++Each subject line is matched separately and independently. If you want to do +multi-line matches, you have to use the \n escape sequence (or \r or \r\n, +etc., depending on the newline setting) in a single line of input to encode the +newline sequences. There is no limit on the length of subject lines; the input +buffer is automatically extended if it is too small. There are replication +features that make it possible to generate long repetitive pattern or subject +lines without having to supply them explicitly. +
++An empty line or the end of the file signals the end of the subject lines for a +test, at which point a new pattern or command line is expected if there is +still input to be read. +
++In between sets of test data, a line that begins with # is interpreted as a +command line. If the first character is followed by white space or an +exclamation mark, the line is treated as a comment, and ignored. Otherwise, the +following commands are recognized: +
+ #forbid_utf ++Subsequent patterns automatically have the PCRE2_NEVER_UTF and PCRE2_NEVER_UCP +options set, which locks out the use of the PCRE2_UTF and PCRE2_UCP options and +the use of (*UTF) and (*UCP) at the start of patterns. This command also forces +an error if a subsequent pattern contains any occurrences of \P, \p, or \X, +which are still supported when PCRE2_UTF is not set, but which require Unicode +property support to be included in the library. + +
+This is a trigger guard that is used in test files to ensure that UTF or +Unicode property tests are not accidentally added to files that are used when +Unicode support is not included in the library. Setting PCRE2_NEVER_UTF and +PCRE2_NEVER_UCP as a default can also be obtained by the use of #pattern; +the difference is that #forbid_utf cannot be unset, and the automatic +options are not displayed in pattern information, to avoid cluttering up test +output. +
+ #load <filename> ++This command is used to load a set of precompiled patterns from a file, as +described in the section entitled "Saving and restoring compiled patterns" +below. +
+ #loadtables <filename> ++This command is used to load a set of binary character tables that can be +accessed by the tables=3 qualifier. Such tables can be created by the +pcre2_dftables program with the -b option. +
+ #newline_default [<newline-list>] ++When PCRE2 is built, a default newline convention can be specified. This +determines which characters and/or character pairs are recognized as indicating +a newline in a pattern or subject string. The default can be overridden when a +pattern is compiled. The standard test files contain tests of various newline +conventions, but the majority of the tests expect a single linefeed to be +recognized as a newline by default. Without special action the tests would fail +when PCRE2 is compiled with either CR or CRLF as the default newline. + +
+The #newline_default command specifies a list of newline types that are +acceptable as the default. The types must be one of CR, LF, CRLF, ANYCRLF, +ANY, or NUL (in upper or lower case), for example: +
+ #newline_default LF Any anyCRLF ++If the default newline is in the list, this command has no effect. Otherwise, +except when testing the POSIX API, a newline modifier that specifies the +first newline convention in the list (LF in the above example) is added to any +pattern that does not already have a newline modifier. If the newline +list is empty, the feature is turned off. This command is present in a number +of the standard test input files. + +
+When the POSIX API is being tested there is no way to override the default +newline convention, though it is possible to set the newline convention from +within the pattern. A warning is given if the posix or posix_nosub +modifier is used when #newline_default would set a default for the +non-POSIX API. +
+ #pattern <modifier-list> ++This command sets a default modifier list that applies to all subsequent +patterns. Modifiers on a pattern can change these settings. +
+ #perltest ++This line is used in test files that can also be processed by perltest.sh +to confirm that Perl gives the same results as PCRE2. Subsequent tests are +checked for the use of pcre2test features that are incompatible with the +perltest.sh script. + +
+Patterns must use '/' as their delimiter, and only certain modifiers are +supported. Comment lines, #pattern commands, and #subject commands that set or +unset "mark" are recognized and acted on. The #perltest, #forbid_utf, and +#newline_default commands, which are needed in the relevant pcre2test files, +are silently ignored. All other command lines are ignored, but give a warning +message. The #perltest command helps detect tests that are accidentally +put in the wrong file or use the wrong delimiter. For more details of the +perltest.sh script see the comments it contains. +
+ #pop [<modifiers>] + #popcopy [<modifiers>] ++These commands are used to manipulate the stack of compiled patterns, as +described in the section entitled "Saving and restoring compiled patterns" +below. +
+ #save <filename> ++This command is used to save a set of compiled patterns to a file, as described +in the section entitled "Saving and restoring compiled patterns" +below. +
+ #subject <modifier-list> ++This command sets a default modifier list that applies to all subsequent +subject lines. Modifiers on a subject line can change these settings. + +
+Modifier lists are used with both pattern and subject lines. Items in a list +are separated by commas followed by optional white space. Trailing whitespace +in a modifier list is ignored. Some modifiers may be given for both patterns +and subject lines, whereas others are valid only for one or the other. Each +modifier has a long name, for example "anchored", and some of them must be +followed by an equals sign and a value, for example, "offset=12". Values cannot +contain comma characters, but may contain spaces. Modifiers that do not take +values may be preceded by a minus sign to turn off a previous setting. +
++A few of the more common modifiers can also be specified as single letters, for +example "i" for "caseless". In documentation, following the Perl convention, +these are written with a slash ("the /i modifier") for clarity. Abbreviated +modifiers must all be concatenated in the first item of a modifier list. If the +first item is not recognized as a long modifier name, it is interpreted as a +sequence of these abbreviations. For example: +
+ /abc/ig,newline=cr,jit=3 ++This is a pattern line whose modifier list starts with two one-letter modifiers +(/i and /g). The lower-case abbreviated modifiers are the same as used in Perl. + +
+A pattern line must start with one of the following characters (common symbols, +excluding pattern meta-characters): +
+ / ! " ' ` - = _ : ; , % & @ ~ ++This is interpreted as the pattern's delimiter. A regular expression may be +continued over several input lines, in which case the newline characters are +included within it. It is possible to include the delimiter as a literal within +the pattern by escaping it with a backslash, for example +
+ /abc\/def/ ++If you do this, the escape and the delimiter form part of the pattern, but +since the delimiters are all non-alphanumeric, the inclusion of the backslash +does not affect the pattern's interpretation. Note, however, that this trick +does not work within \Q...\E literal bracketing because the backslash will +itself be interpreted as a literal. If the terminating delimiter is immediately +followed by a backslash, for example, +
+ /abc/\ ++a backslash is added to the end of the pattern. This is done to provide a way +of testing the error condition that arises if a pattern finishes with a +backslash, because +
+ /abc\/ ++is interpreted as the first line of a pattern that starts with "abc/", causing +pcre2test to read the next line as a continuation of the regular expression. + +
+A pattern can be followed by a modifier list (details below). +
++Before each subject line is passed to pcre2_match(), +pcre2_dfa_match(), or pcre2_jit_match(), leading and trailing white +space is removed, and the line is scanned for backslash escapes, unless the +subject_literal modifier was set for the pattern. The following provide a +means of encoding non-printing characters in a visible way: +
+ \a alarm (BEL, \x07)
+ \b backspace (\x08)
+ \e escape (\x1b)
+ \f form feed (\x0c)
+ \n newline (\x0a)
+ \r carriage return (\x0d)
+ \t tab (\x09)
+ \v vertical tab (\x0b)
+ \nnn octal character (up to 3 octal digits); always
+ a byte unless > 255 in UTF-8 or 16-bit or 32-bit mode
+ \o{dd...} octal character (any number of octal digits)
+ \xhh hexadecimal byte (up to 2 hex digits)
+ \x{hh...} hexadecimal character (any number of hex digits)
+
+The use of \x{hh...} is not dependent on the use of the utf modifier on
+the pattern. It is recognized always. There may be any number of hexadecimal
+digits inside the braces; invalid values provoke error messages.
+
++Note that \xhh specifies one byte rather than one character in UTF-8 mode; +this makes it possible to construct invalid UTF-8 sequences for testing +purposes. On the other hand, \x{hh} is interpreted as a UTF-8 character in +UTF-8 mode, generating more than one byte if the value is greater than 127. +When testing the 8-bit library not in UTF-8 mode, \x{hh} generates one byte +for values less than 256, and causes an error for greater values. +
++In UTF-16 mode, all 4-digit \x{hhhh} values are accepted. This makes it +possible to construct invalid UTF-16 sequences for testing purposes. +
++In UTF-32 mode, all 4- to 8-digit \x{...} values are accepted. This makes it +possible to construct invalid UTF-32 sequences for testing purposes. +
++There is a special backslash sequence that specifies replication of one or more +characters: +
+ \[<characters>]{<count>}
+
+This makes it possible to test long strings without having to provide them as
+part of the file. For example:
+
+ \[abc]{4}
+
+is converted to "abcabcabcabc". This feature does not support nesting. To
+include a closing square bracket in the characters, code it as \x5D.
+
++A backslash followed by an equals sign marks the end of the subject string and +the start of a modifier list. For example: +
+ abc\=notbol,notempty ++If the subject string is empty and \= is followed by whitespace, the line is +treated as a comment line, and is not used for matching. For example: +
+ \= This is a comment. + abc\= This is an invalid modifier list. ++A backslash followed by any other non-alphanumeric character just escapes that +character. A backslash followed by anything else causes an error. However, if +the very last character in the line is a backslash (and there is no modifier +list), it is ignored. This gives a way of passing an empty line as data, since +a real empty line terminates the data input. + +
+If the subject_literal modifier is set for a pattern, all subject lines +that follow are treated as literals, with no special treatment of backslashes. +No replication is possible, and any subject modifiers must be set as defaults +by a #subject command. +
++There are several types of modifier that can appear in pattern lines. Except +where noted below, they may also be used in #pattern commands. A +pattern's modifier list can add to or override default modifiers that were set +by a previous #pattern command. +
++The following modifiers set options for pcre2_compile(). Most of them set +bits in the options argument of that function, but those whose names start with +PCRE2_EXTRA are additional options that are set in the compile context. +Some of these options have single-letter abbreviations. There is special +handling for /x: if a second x is present, PCRE2_EXTENDED is converted into +PCRE2_EXTENDED_MORE as in Perl. A third appearance adds PCRE2_EXTENDED as well, +though this makes no difference to the way pcre2_compile() behaves. See +pcre2api +for a description of the effects of these options. +
+ allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS + allow_lookaround_bsk set PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK + allow_surrogate_escapes set PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES + alt_bsux set PCRE2_ALT_BSUX + alt_circumflex set PCRE2_ALT_CIRCUMFLEX + alt_verbnames set PCRE2_ALT_VERBNAMES + anchored set PCRE2_ANCHORED + /a ascii_all set all ASCII options + ascii_bsd set PCRE2_EXTRA_ASCII_BSD + ascii_bss set PCRE2_EXTRA_ASCII_BSS + ascii_bsw set PCRE2_EXTRA_ASCII_BSW + ascii_digit set PCRE2_EXTRA_ASCII_DIGIT + ascii_posix set PCRE2_EXTRA_ASCII_POSIX + auto_callout set PCRE2_AUTO_CALLOUT + bad_escape_is_literal set PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL + /i caseless set PCRE2_CASELESS + /r caseless_restrict set PCRE2_EXTRA_CASELESS_RESTRICT + dollar_endonly set PCRE2_DOLLAR_ENDONLY + /s dotall set PCRE2_DOTALL + dupnames set PCRE2_DUPNAMES + endanchored set PCRE2_ENDANCHORED + escaped_cr_is_lf set PCRE2_EXTRA_ESCAPED_CR_IS_LF + /x extended set PCRE2_EXTENDED + /xx extended_more set PCRE2_EXTENDED_MORE + extra_alt_bsux set PCRE2_EXTRA_ALT_BSUX + firstline set PCRE2_FIRSTLINE + literal set PCRE2_LITERAL + match_line set PCRE2_EXTRA_MATCH_LINE + match_invalid_utf set PCRE2_MATCH_INVALID_UTF + match_unset_backref set PCRE2_MATCH_UNSET_BACKREF + match_word set PCRE2_EXTRA_MATCH_WORD + /m multiline set PCRE2_MULTILINE + never_backslash_c set PCRE2_NEVER_BACKSLASH_C + never_ucp set PCRE2_NEVER_UCP + never_utf set PCRE2_NEVER_UTF + /n no_auto_capture set PCRE2_NO_AUTO_CAPTURE + no_auto_possess set PCRE2_NO_AUTO_POSSESS + no_dotstar_anchor set PCRE2_NO_DOTSTAR_ANCHOR + no_start_optimize set PCRE2_NO_START_OPTIMIZE + no_utf_check set PCRE2_NO_UTF_CHECK + ucp set PCRE2_UCP + ungreedy set PCRE2_UNGREEDY + use_offset_limit set PCRE2_USE_OFFSET_LIMIT + utf set PCRE2_UTF ++As well as turning on the PCRE2_UTF option, the utf modifier causes all +non-printing characters in output strings to be printed using the \x{hh...} +notation. 
Otherwise, those less than 0x100 are output in hex without the curly +brackets. Setting utf in 16-bit or 32-bit mode also causes pattern and +subject strings to be translated to UTF-16 or UTF-32, respectively, before +being passed to library functions. + +
+The following modifiers affect the compilation process or request information +about the pattern. There are single-letter abbreviations for some that are +heavily used in the test files. +
+ bsr=[anycrlf|unicode] specify \R handling + /B bincode show binary code without lengths + callout_info show callout information + convert=<options> request foreign pattern conversion + convert_glob_escape=c set glob escape character + convert_glob_separator=c set glob separator character + convert_length set convert buffer length + debug same as info,fullbincode + framesize show matching frame size + fullbincode show binary code with lengths + /I info show info about compiled pattern + hex unquoted characters are hexadecimal + jit[=<number>] use JIT + jitfast use JIT fast path + jitverify verify JIT use + locale=<name> use this locale + max_pattern_compiled ) set maximum compiled pattern + _length=<n> ) length (bytes) + max_pattern_length=<n> set maximum pattern length (code units) + max_varlookbehind=<n> set maximum variable lookbehind length + memory show memory used + newline=<type> set newline type + null_context compile with a NULL context + null_pattern pass pattern as NULL + parens_nest_limit=<n> set maximum parentheses depth + posix use the POSIX API + posix_nosub use the POSIX API with REG_NOSUB + push push compiled pattern onto the stack + pushcopy push a copy onto the stack + stackguard=<number> test the stackguard feature + subject_literal treat all subject lines as literal + tables=[0|1|2|3] select internal tables + use_length do not zero-terminate the pattern + utf8_input treat input as UTF-8 ++The effects of these modifiers are described in the following sections. + +
+The bsr modifier specifies what \R in a pattern should match. If it is +set to "anycrlf", \R matches CR, LF, or CRLF only. If it is set to "unicode", +\R matches any Unicode newline sequence. The default can be specified when +PCRE2 is built; if it is not, the default is set to Unicode. +
++The newline modifier specifies which characters are to be interpreted as +newlines, both in the pattern and in subject lines. The type must be one of CR, +LF, CRLF, ANYCRLF, ANY, or NUL (in upper or lower case). +
++The debug modifier is a shorthand for info,fullbincode, requesting +all available information. +
++The bincode modifier causes a representation of the compiled code to be +output after compilation. This information does not contain length and offset +values, which ensures that the same output is generated for different internal +link sizes and different code unit widths. By using bincode, the same +regression tests can be used in different environments. +
++The fullbincode modifier, by contrast, does include length and +offset values. This is used in a few special tests that run only for specific +code unit widths and link sizes, and is also useful for one-off tests. +
++The info modifier requests information about the compiled pattern +(whether it is anchored, has a fixed first character, and so on). The +information is obtained from the pcre2_pattern_info() function. Here are +some typical examples: +
+ re> /(?i)(^a|^b)/m,info + Capture group count = 1 + Compile options: multiline + Overall options: caseless multiline + First code unit at start or follows newline + Subject length lower bound = 1 + + re> /(?i)abc/info + Capture group count = 0 + Compile options: <none> + Overall options: caseless + First code unit = 'a' (caseless) + Last code unit = 'c' (caseless) + Subject length lower bound = 3 ++"Compile options" are those specified by modifiers; "overall options" have +added options that are taken or deduced from the pattern. If both sets of +options are the same, just a single "options" line is output; if there are no +options, the line is omitted. "First code unit" is where any match must start; +if there is more than one they are listed as "starting code units". "Last code +unit" is the last literal code unit that must be present in any match. This is +not necessarily the last character. These lines are omitted if no starting or +ending code units are recorded. The subject length line is omitted when +no_start_optimize is set because the minimum length is not calculated +when it can never be used. + +
+The framesize modifier shows the size, in bytes, of each storage frame +used by pcre2_match() for handling backtracking. The size depends on the +number of capturing parentheses in the pattern. A vector of these frames is +used at matching time; its overall size is shown when the heapframes_size +subject modifier is set. +
++The callout_info modifier requests information about all the callouts in +the pattern. A list of them is output at the end of any other information that +is requested. For each callout, either its number or string is given, followed +by the item that follows it in the pattern. +
++Normally, pcre2test passes a context block to pcre2_compile(). If +the null_context modifier is set, however, NULL is passed. This is for +testing that pcre2_compile() behaves correctly in this case (it uses +default values). +
++The null_pattern modifier is for testing the behaviour of +pcre2_compile() when the pattern argument is NULL. The length value +passed is the default PCRE2_ZERO_TERMINATED unless use_length is set. +Any length other than zero causes an error. +
++The hex modifier specifies that the characters of the pattern, except for +substrings enclosed in single or double quotes, are to be interpreted as pairs +of hexadecimal digits. This feature is provided as a way of creating patterns +that contain binary zeros and other non-printing characters. White space is +permitted between pairs of digits. For example, this pattern contains three +characters: +
+ /ab 32 59/hex ++Parts of such a pattern are taken literally if quoted. This pattern contains +nine characters, only two of which are specified in hexadecimal: +
+ /ab "literal" 32/hex ++Either single or double quotes may be used. There is no way of including +the delimiter within a substring. The hex and expand modifiers are +mutually exclusive. + +
+By default, patterns are passed to the compiling functions as zero-terminated +strings but can be passed by length instead of being zero-terminated. The +use_length modifier causes this to happen. Using a length happens +automatically (whether or not use_length is set) when hex is set, +because patterns specified in hexadecimal may contain binary zeros. +
++If hex or use_length is used with the POSIX wrapper API (see +"Using the POSIX wrapper API" +below), the REG_PEND extension is used to pass the pattern's length. +
++Variable lookbehind assertions are supported only if, for each one, there is a +maximum length (in characters) that it can match. There is a limit on this, +whose default can be set at build time, with an ultimate default of 255. The +max_varlookbehind modifier uses the pcre2_set_max_varlookbehind() +function to change the limit. Lookbehinds whose branches each match a fixed +length are limited to 65535 characters per branch. +
++In 16-bit and 32-bit modes, all input is automatically treated as UTF-8 and +translated to UTF-16 or UTF-32 when the utf modifier is set. For testing +the 16-bit and 32-bit libraries in non-UTF mode, the utf8_input modifier +can be used. It is mutually exclusive with utf. Input lines are +interpreted as UTF-8 as a means of specifying wide characters. More details are +given in +"Input encoding" +above. +
++Some tests use long patterns that are very repetitive. Instead of creating a +very long input line for such a pattern, you can use a special repetition +feature, similar to the one described for subject lines above. If the +expand modifier is present on a pattern, parts of the pattern that have +the form +
+ \[<characters>]{<count>}
+
+are expanded before the pattern is passed to pcre2_compile(). For
+example, \[AB]{6000} is expanded to "ABAB..." 6000 times. This construction
+cannot be nested. An initial "\[" sequence is recognized only if "]{" followed
+by decimal digits and "}" is found later in the pattern. If not, the characters
+remain in the pattern unaltered. The expand and hex modifiers are
+mutually exclusive.
+
++If part of an expanded pattern looks like an expansion, but is really part of +the actual pattern, unwanted expansion can be avoided by giving two values in +the quantifier. For example, \[AB]{6000,6000} is not recognized as an +expansion item. +
++If the info modifier is set on an expanded pattern, the result of the +expansion is included in the information that is output. +
++Just-in-time (JIT) compiling is a heavyweight optimization that can greatly +speed up pattern matching. See the +pcre2jit +documentation for details. JIT compiling happens, optionally, after a pattern +has been successfully compiled into an internal form. The JIT compiler converts +this to optimized machine code. It needs to know whether the match-time options +PCRE2_PARTIAL_HARD and PCRE2_PARTIAL_SOFT are going to be used, because +different code is generated for the different cases. See the partial +modifier in "Subject Modifiers" +below +for details of how these options are specified for each match attempt. +
++JIT compilation is requested by the jit pattern modifier, which may +optionally be followed by an equals sign and a number in the range 0 to 7. +The three bits that make up the number specify which of the three JIT operating +modes are to be compiled: +
+ 1 compile JIT code for non-partial matching + 2 compile JIT code for soft partial matching + 4 compile JIT code for hard partial matching ++The possible values for the jit modifier are therefore: +
+ 0 disable JIT + 1 normal matching only + 2 soft partial matching only + 3 normal and soft partial matching + 4 hard partial matching only + 5 normal and hard partial matching + 6 soft and hard partial matching only + 7 all three modes ++If no number is given, 7 is assumed. The phrase "partial matching" means a call +to pcre2_match() with either the PCRE2_PARTIAL_SOFT or the +PCRE2_PARTIAL_HARD option set. Note that such a call may return a complete +match; the options enable the possibility of a partial match, but do not +require it. Note also that if you request JIT compilation only for partial +matching (for example, jit=2) but do not set the partial modifier on a +subject line, that match will not use JIT code because none was compiled for +non-partial matching. + +
+If JIT compilation is successful, the compiled JIT code will automatically be +used when an appropriate type of match is run, except when incompatible +run-time options are specified. For more details, see the +pcre2jit +documentation. See also the jitstack modifier below for a way of +setting the size of the JIT stack. +
++If the jitfast modifier is specified, matching is done using the JIT +"fast path" interface, pcre2_jit_match(), which skips some of the sanity +checks that are done by pcre2_match(), and of course does not work when +JIT is not supported. If jitfast is specified without jit, jit=7 is +assumed. +
++If the jitverify modifier is specified, information about the compiled +pattern shows whether JIT compilation was or was not successful. If +jitverify is specified without jit, jit=7 is assumed. If JIT +compilation is successful when jitverify is set, the text "(JIT)" is +added to the first output line after a match or non match when JIT-compiled +code was actually used in the match. +
++The locale modifier must specify the name of a locale, for example: +
+ /pattern/locale=fr_FR ++The given locale is set, pcre2_maketables() is called to build a set of +character tables for the locale, and this is then passed to +pcre2_compile() when compiling the regular expression. The same tables +are used when matching the following subject lines. The locale modifier +applies only to the pattern on which it appears, but can be given in a +#pattern command if a default is needed. Setting a locale and alternate +character tables are mutually exclusive. + +
+The memory modifier causes the size in bytes of the memory used to hold +the compiled pattern to be output. This does not include the size of the +pcre2_code block; it is just the actual compiled data. If the pattern is +subsequently passed to the JIT compiler, the size of the JIT compiled code is +also output. Here is an example: +
+ re> /a(b)c/jit,memory + Memory allocation (code space): 21 + Memory allocation (JIT code): 1910 + ++ +
+The parens_nest_limit modifier sets a limit on the depth of nested +parentheses in a pattern. Breaching the limit causes a compilation error. +The default for the library is set when PCRE2 is built, but pcre2test +sets its own default of 220, which is required for running the standard test +suite. +
++The max_pattern_length modifier sets a limit, in code units, to the +length of pattern that pcre2_compile() will accept. Breaching the limit +causes a compilation error. The default is the largest number a PCRE2_SIZE +variable can hold (essentially unlimited). +
++The max_pattern_compiled_length modifier sets a limit, in bytes, to the +amount of memory used by a compiled pattern. Breaching the limit causes a +compilation error. The default is the largest number a PCRE2_SIZE variable can +hold (essentially unlimited). +
++The posix and posix_nosub modifiers cause pcre2test to call +PCRE2 via the POSIX wrapper API rather than its native API. When +posix_nosub is used, the POSIX option REG_NOSUB is passed to +regcomp(). The POSIX wrapper supports only the 8-bit library. Note that +it does not imply POSIX matching semantics; for more detail see the +pcre2posix +documentation. The following pattern modifiers set options for the +regcomp() function: +
+ caseless REG_ICASE + multiline REG_NEWLINE + dotall REG_DOTALL ) + ungreedy REG_UNGREEDY ) These options are not part of + ucp REG_UCP ) the POSIX standard + utf REG_UTF8 ) ++The regerror_buffsize modifier specifies a size for the error buffer that +is passed to regerror() in the event of a compilation error. For example: +
+ /abc/posix,regerror_buffsize=20 ++This provides a means of testing the behaviour of regerror() when the +buffer is too small for the error message. If this modifier has not been set, a +large buffer is used. + +
+The aftertext and allaftertext subject modifiers work as described +below. All other modifiers are either ignored, with a warning message, or cause +an error. +
++The pattern is passed to regcomp() as a zero-terminated string by +default, but if the use_length or hex modifiers are set, the +REG_PEND extension is used to pass it by length. +
+The stackguard modifier is used to test the use of +pcre2_set_compile_recursion_guard(), a function that is provided to +enable stack availability to be checked during compilation (see the +pcre2api +documentation for details). If the number specified by the modifier is greater +than zero, pcre2_set_compile_recursion_guard() is called to set up a +callback from pcre2_compile() to a local function. The argument it +receives is the current nesting parenthesis depth; if this is greater than the +value given by the modifier, non-zero is returned, causing the compilation to +be aborted. +
++The value specified for the tables modifier must be one of the digits 0, +1, 2, or 3. It causes a specific set of built-in character tables to be passed +to pcre2_compile(). This is used in the PCRE2 tests to check behaviour +with different character tables. The digit specifies the tables as follows: +
+ 0 do not pass any special character tables + 1 the default ASCII tables, as distributed in + pcre2_chartables.c.dist + 2 a set of tables defining ISO 8859 characters + 3 a set of tables loaded by the #loadtables command ++In tables 2, some characters whose codes are greater than 128 are identified as +letters, digits, spaces, etc. Tables 3 can be used only after a +#loadtables command has loaded them from a binary file. Setting alternate +character tables and a locale are mutually exclusive. + +
+The following modifiers are really subject modifiers, and are described under +"Subject Modifiers" below. However, they may be included in a pattern's +modifier list, in which case they are applied to every subject line that is +processed with that pattern. These modifiers do not affect the compilation +process. +
+ aftertext show text after match + allaftertext show text after captures + allcaptures show all captures + allvector show the entire ovector + allusedtext show all consulted text + altglobal alternative global matching + /g global global matching + heapframes_size show match data heapframes size + jitstack=<n> set size of JIT stack + mark show mark values + replace=<string> specify a replacement string + startchar show starting character when relevant + substitute_callout use substitution callouts + substitute_extended use PCRE2_SUBSTITUTE_EXTENDED + substitute_literal use PCRE2_SUBSTITUTE_LITERAL + substitute_matched use PCRE2_SUBSTITUTE_MATCHED + substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH + substitute_replacement_only use PCRE2_SUBSTITUTE_REPLACEMENT_ONLY + substitute_skip=<n> skip substitution <n> + substitute_stop=<n> skip substitution <n> and following + substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET + substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY ++These modifiers may not appear in a #pattern command. If you want them as +defaults, set them in a #subject command. + +
+If the subject_literal modifier is present on a pattern, all the subject +lines that it matches are taken as literal strings, with no interpretation of +backslashes. It is not possible to set subject modifiers on such lines, but any +that are set as defaults by a #subject command are recognized. +
++When a pattern with the push modifier is successfully compiled, it is +pushed onto a stack of compiled patterns, and pcre2test expects the next +line to contain a new pattern (or a command) instead of a subject line. This +facility is used when saving compiled patterns to a file, as described in the +section entitled "Saving and restoring compiled patterns" +below. +If pushcopy is used instead of push, a copy of the compiled +pattern is stacked, leaving the original as current, ready to match the +following input lines. This provides a way of testing the +pcre2_code_copy() function. +The push and pushcopy modifiers are incompatible with compilation +modifiers such as global that act at match time. Any that are specified +are ignored (for the stacked copy), with a warning message, except for +replace, which causes an error. Note that jitverify, which is +allowed, does not carry through to any subsequent matching that uses a stacked +pattern. +
++The experimental foreign pattern conversion functions in PCRE2 can be tested by +setting the convert modifier. Its argument is a colon-separated list of +options, which set the equivalent option for the pcre2_pattern_convert() +function: +
+ glob PCRE2_CONVERT_GLOB + glob_no_starstar PCRE2_CONVERT_GLOB_NO_STARSTAR + glob_no_wild_separator PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR + posix_basic PCRE2_CONVERT_POSIX_BASIC + posix_extended PCRE2_CONVERT_POSIX_EXTENDED + unset Unset all options ++The "unset" value is useful for turning off a default that has been set by a +#pattern command. When one of these options is set, the input pattern is +passed to pcre2_pattern_convert(). If the conversion is successful, the +result is reflected in the output and then passed to pcre2_compile(). The +normal utf and no_utf_check options, if set, cause the +PCRE2_CONVERT_UTF and PCRE2_CONVERT_NO_UTF_CHECK options to be passed to +pcre2_pattern_convert(). + +
+By default, the conversion function is allowed to allocate a buffer for its +output. However, if the convert_length modifier is set to a value greater +than zero, pcre2test passes a buffer of the given length. This makes it +possible to test the length check. +
++The convert_glob_escape and convert_glob_separator modifiers can be +used to specify the escape and separator characters for glob processing, +overriding the defaults, which are operating-system dependent. +
++The modifiers that can appear in subject lines and the #subject +command are of two types. +
+The following modifiers set options for pcre2_match() or +pcre2_dfa_match(). See +pcre2api +for a description of their effects. +
+ anchored set PCRE2_ANCHORED + endanchored set PCRE2_ENDANCHORED + dfa_restart set PCRE2_DFA_RESTART + dfa_shortest set PCRE2_DFA_SHORTEST + disable_recurseloop_check set PCRE2_DISABLE_RECURSELOOP_CHECK + no_jit set PCRE2_NO_JIT + no_utf_check set PCRE2_NO_UTF_CHECK + notbol set PCRE2_NOTBOL + notempty set PCRE2_NOTEMPTY + notempty_atstart set PCRE2_NOTEMPTY_ATSTART + noteol set PCRE2_NOTEOL + partial_hard (or ph) set PCRE2_PARTIAL_HARD + partial_soft (or ps) set PCRE2_PARTIAL_SOFT ++The partial matching modifiers are provided with abbreviations because they +appear frequently in tests. + +
+If the posix or posix_nosub modifier was present on the pattern, +causing the POSIX wrapper API to be used, the only option-setting modifiers +that have any effect are notbol, notempty, and noteol, +causing REG_NOTBOL, REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to +regexec(). The other modifiers are ignored, with a warning message. +
++There is one additional modifier that can be used with the POSIX wrapper. It is +ignored (with a warning) if used for non-POSIX matching. +
+ posix_startend=<n>[:<m>] ++This causes the subject string to be passed to regexec() using the +REG_STARTEND option, which uses offsets to specify which part of the string is +searched. If only one number is given, the end offset is passed as the end of +the subject string. For more detail of REG_STARTEND, see the +pcre2posix +documentation. If the subject string contains binary zeros (coded as escapes +such as \x{00} because pcre2test does not support actual binary zeros in +its input), you must use posix_startend to specify its length. + +
+The following modifiers affect the matching process or request additional +information. Some of them may also be specified on a pattern line (see above), +in which case they apply to every subject line that is matched against that +pattern, but can be overridden by modifiers on the subject. +
+ aftertext show text after match + allaftertext show text after captures + allcaptures show all captures + allvector show the entire ovector + allusedtext show all consulted text (non-JIT only) + altglobal alternative global matching + callout_capture show captures at callout time + callout_data=<n> set a value to pass via callouts + callout_error=<n>[:<m>] control callout error + callout_extra show extra callout information + callout_fail=<n>[:<m>] control callout failure + callout_no_where do not show position of a callout + callout_none do not supply a callout function + copy=<number or name> copy captured substring + depth_limit=<n> set a depth limit + dfa use pcre2_dfa_match() + find_limits find heap, match and depth limits + find_limits_noheap find match and depth limits + get=<number or name> extract captured substring + getall extract all captured substrings + /g global global matching + heapframes_size show match data heapframes size + heap_limit=<n> set a limit on heap memory (Kbytes) + jitstack=<n> set size of JIT stack + mark show mark values + match_limit=<n> set a match limit + memory show heap memory usage + null_context match with a NULL context + null_replacement substitute with NULL replacement + null_subject match with NULL subject + offset=<n> set starting offset + offset_limit=<n> set offset limit + ovector=<n> set size of output vector + recursion_limit=<n> obsolete synonym for depth_limit + replace=<string> specify a replacement string + startchar show startchar when relevant + startoffset=<n> same as offset=<n> + substitute_callout use substitution callouts + substitute_extended use PCRE2_SUBSTITUTE_EXTENDED + substitute_literal use PCRE2_SUBSTITUTE_LITERAL + substitute_matched use PCRE2_SUBSTITUTE_MATCHED + substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH + substitute_replacement_only use PCRE2_SUBSTITUTE_REPLACEMENT_ONLY + substitute_skip=<n> skip substitution number n + substitute_stop=<n> skip substitution number n and 
greater + substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET + substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY + zero_terminate pass the subject as zero-terminated ++The effects of these modifiers are described in the following sections. When +matching via the POSIX wrapper API, the aftertext, allaftertext, +and ovector subject modifiers work as described below. All other +modifiers are either ignored, with a warning message, or cause an error. + +
+The aftertext modifier requests that as well as outputting the part of +the subject string that matched the entire pattern, pcre2test should in +addition output the remainder of the subject string. This is useful for tests +where the subject contains multiple copies of the same substring. The +allaftertext modifier requests the same action for captured substrings as +well as the main matched substring. In each case the remainder is output on the +following line with a plus character following the capture number. +
++The allusedtext modifier requests that all the text that was consulted +during a successful pattern match by the interpreter should be shown, for both +full and partial matches. This feature is not supported for JIT matching, and +if requested with JIT it is ignored (with a warning message). Setting this +modifier affects the output if there is a lookbehind at the start of a match, +or, for a complete match, a lookahead at the end, or if \K is used in the +pattern. Characters that precede or follow the start and end of the actual +match are indicated in the output by '<' or '>' characters underneath them. +Here is an example: +
+ re> /(?<=pqr)abc(?=xyz)/ + data> 123pqrabcxyz456\=allusedtext + 0: pqrabcxyz + <<< >>> + data> 123pqrabcxy\=ph,allusedtext + Partial match: pqrabcxy + <<< ++The first, complete match shows that the matched string is "abc", with the +preceding and following strings "pqr" and "xyz" having been consulted during +the match (when processing the assertions). The partial match can indicate only +the preceding string. + +
+The startchar modifier requests that the starting character for the match +be indicated, if it is different to the start of the matched string. The only +time when this occurs is when \K has been processed as part of the match. In +this situation, the output for the matched string is displayed from the +starting character instead of from the match point, with circumflex characters +under the earlier characters. For example: +
+ re> /abc\Kxyz/ + data> abcxyz\=startchar + 0: abcxyz + ^^^ ++Unlike allusedtext, the startchar modifier can be used with JIT. +However, these two modifiers are mutually exclusive. + +
+The allcaptures modifier requests that the values of all potential +captured parentheses be output after a match. By default, only those up to the +highest one actually used in the match are output (corresponding to the return +code from pcre2_match()). Groups that did not take part in the match +are output as "<unset>". This modifier is not relevant for DFA matching (which +does no capturing) and does not apply when replace is specified; it is +ignored, with a warning message, if present. +
++The allvector modifier requests that the entire ovector be shown, +whatever the outcome of the match. Compare allcaptures, which shows only +up to the maximum number of capture groups for the pattern, and then only for a +successful complete non-DFA match. This modifier, which acts after any match +result, and also for DFA matching, provides a means of checking that there are +no unexpected modifications to ovector fields. Before each match attempt, the +ovector is filled with a special value, and if this is found in both elements +of a capturing pair, "<unchanged>" is output. After a successful match, this +applies to all groups after the maximum capture group for the pattern. In other +cases it applies to the entire ovector. After a partial match, the first two +elements are the only ones that should be set. After a DFA match, the amount of +ovector that is used depends on the number of matches that were found. +
++A callout function is supplied when pcre2test calls the library matching +functions, unless callout_none is specified. Its behaviour can be +controlled by various modifiers listed above whose names begin with +callout_. Details are given in the section entitled "Callouts" +below. +Testing callouts from pcre2_substitute() is described separately in +"Testing the substitution function" +below. +
++Searching for all possible matches within a subject can be requested by the +global or altglobal modifier. After finding a match, the matching +function is called again to search the remainder of the subject. The difference +between global and altglobal is that the former uses the +start_offset argument to pcre2_match() or pcre2_dfa_match() +to start searching at a new point within the entire string (which is what Perl +does), whereas the latter passes over a shortened subject. This makes a +difference to the matching process if the pattern begins with a lookbehind +assertion (including \b or \B). +
++If an empty string is matched, the next match is done with the +PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED flags set, in order to search for +another, non-empty, match at the same point in the subject. If this match +fails, the start offset is advanced, and the normal match is retried. This +imitates the way Perl handles such cases when using the /g modifier or +the split() function. Normally, the start offset is advanced by one +character, but if the newline convention recognizes CRLF as a newline, and the +current character is CR followed by LF, an advance of two characters occurs. +
++The copy and get modifiers can be used to test the +pcre2_substring_copy_xxx() and pcre2_substring_get_xxx() functions. +They can be given more than once, and each can specify a capture group name or +number, for example: +
+ abcd\=copy=1,copy=3,get=G1 ++If the #subject command is used to set default copy and/or get lists, +these can be unset by specifying a negative number to cancel all numbered +groups and an empty name to cancel all named groups. + +
+The getall modifier tests pcre2_substring_list_get(), which +extracts all captured substrings. +
++If the subject line is successfully matched, the substrings extracted by the +convenience functions are output with C, G, or L after the string number +instead of a colon. This is in addition to the normal full list. The string +length (that is, the return from the extraction function) is given in +parentheses after each substring, followed by the name when the extraction was +by name. +
++If the replace modifier is set, the pcre2_substitute() function is +called instead of one of the matching functions (or after one call of +pcre2_match() in the case of PCRE2_SUBSTITUTE_MATCHED). Note that +replacement strings cannot contain commas, because a comma signifies the end of +a modifier. This is not thought to be an issue in a test program. +
++Specifying a completely empty replacement string disables this modifier. +However, it is possible to specify an empty replacement by providing a buffer +length, as described below, for an otherwise empty replacement. +
++Unlike subject strings, pcre2test does not process replacement strings +for escape sequences. In UTF mode, a replacement string is checked to see if it +is a valid UTF-8 string. If so, it is correctly converted to a UTF string of +the appropriate code unit width. If it is not a valid UTF-8 string, the +individual code units are copied directly. This provides a means of passing an +invalid UTF-8 string for testing purposes. +
+The following modifiers set options (in addition to the normal match options) +for pcre2_substitute(): +
+ global PCRE2_SUBSTITUTE_GLOBAL + substitute_extended PCRE2_SUBSTITUTE_EXTENDED + substitute_literal PCRE2_SUBSTITUTE_LITERAL + substitute_matched PCRE2_SUBSTITUTE_MATCHED + substitute_overflow_length PCRE2_SUBSTITUTE_OVERFLOW_LENGTH + substitute_replacement_only PCRE2_SUBSTITUTE_REPLACEMENT_ONLY + substitute_unknown_unset PCRE2_SUBSTITUTE_UNKNOWN_UNSET + substitute_unset_empty PCRE2_SUBSTITUTE_UNSET_EMPTY ++See the +pcre2api +documentation for details of these options. + +
+After a successful substitution, the modified string is output, preceded by the +number of replacements. This may be zero if there were no matches. Here is a +simple example of a substitution test: +
+ /abc/replace=xxx + =abc=abc= + 1: =xxx=abc= + =abc=abc=\=global + 2: =xxx=xxx= ++Subject and replacement strings should be kept relatively short (fewer than 256 +characters) for substitution tests, as fixed-size buffers are used. To make it +easy to test for buffer overflow, if the replacement string starts with a +number in square brackets, that number is passed to pcre2_substitute() as +the size of the output buffer, with the replacement string starting at the next +character. Here is an example that tests the edge case: +
+ /abc/ + 123abc123\=replace=[10]XYZ + 1: 123XYZ123 + 123abc123\=replace=[9]XYZ + Failed: error -47: no more memory ++The default action of pcre2_substitute() is to return +PCRE2_ERROR_NOMEMORY when the output buffer is too small. However, if the +PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set (by using the +substitute_overflow_length modifier), pcre2_substitute() continues +to go through the motions of matching and substituting (but not doing any +callouts), in order to compute the size of buffer that is required. When this +happens, pcre2test shows the required buffer length (which includes space +for the trailing zero) as part of the error message. For example: +
+ /abc/substitute_overflow_length + 123abc123\=replace=[9]XYZ + Failed: error -47: no more memory: 10 code units are needed ++A replacement string is ignored with POSIX and DFA matching. Specifying partial +matching provokes an error return ("bad option value") from +pcre2_substitute(). + +
+If the substitute_callout modifier is set, a substitution callout +function is set up. The null_context modifier must not be set, because +the address of the callout function is passed in a match context. When the +callout function is called (after each substitution), details of the input +and output strings are output. For example: +
+ /abc/g,replace=<$0>,substitute_callout + abcdefabcpqr + 1(1) Old 0 3 "abc" New 0 5 "<abc>" + 2(1) Old 6 9 "abc" New 8 13 "<abc>" + 2: <abc>def<abc>pqr ++The first number on each callout line is the count of matches. The +parenthesized number is the number of pairs that are set in the ovector (that +is, one more than the number of capturing groups that were set). Then are +listed the offsets of the old substring, its contents, and the same for the +replacement. + +
+By default, the substitution callout function returns zero, which accepts the +replacement and causes matching to continue if /g was used. Two further +modifiers can be used to test other return values. If substitute_skip is +set to a value greater than zero the callout function returns +1 for the match +of that number, and similarly substitute_stop returns -1. These cause the +replacement to be rejected, and -1 causes no further matching to take place. If +either of them is set, substitute_callout is assumed. For example: +
+ /abc/g,replace=<$0>,substitute_skip=1 + abcdefabcpqr + 1(1) Old 0 3 "abc" New 0 5 "<abc> SKIPPED" + 2(1) Old 6 9 "abc" New 6 11 "<abc>" + 2: abcdef<abc>pqr + abcdefabcpqr\=substitute_stop=1 + 1(1) Old 0 3 "abc" New 0 5 "<abc> STOPPED" + 1: abcdefabcpqr ++If both are set for the same number, stop takes precedence. Only a single skip +or stop is supported, which is sufficient for testing that the feature works. + +
+The jitstack modifier provides a way of setting the maximum stack size +that is used by the just-in-time optimization code. It is ignored if JIT +optimization is not being used. The value is a number of kibibytes (units of +1024 bytes). Setting zero reverts to the default of 32KiB. Providing a stack +that is larger than the default is necessary only for very complicated +patterns. If jitstack is set non-zero on a subject line it overrides any +value that was set on the pattern. +
++The heap_limit, match_limit, and depth_limit modifiers set +the appropriate limits in the match context. These values are ignored when the +find_limits or find_limits_noheap modifier is specified. +
++If the find_limits modifier is present on a subject line, pcre2test +calls the relevant matching function several times, setting different values in +the match context via pcre2_set_heap_limit(), +pcre2_set_match_limit(), or pcre2_set_depth_limit() until it finds +the smallest value for each parameter that allows the match to complete without +a "limit exceeded" error. The match itself may succeed or fail. An alternative +modifier, find_limits_noheap, omits the heap limit. This is used in the +standard tests, because the minimum heap limit varies between systems. If JIT +is being used, only the match limit is relevant, and the other two are +automatically omitted. +
++When using this modifier, the pattern should not contain any limit settings +such as (*LIMIT_MATCH=...) within it. If such a setting is present and is +lower than the minimum matching value, the minimum value cannot be found +because pcre2_set_match_limit() etc. are only able to reduce the value of +an in-pattern limit; they cannot increase it. +
++For non-DFA matching, the minimum depth_limit number is a measure of how +much nested backtracking happens (that is, how deeply the pattern's tree is +searched). In the case of DFA matching, depth_limit controls the depth of +recursive calls of the internal function that is used for handling pattern +recursion, lookaround assertions, and atomic groups. +
++For non-DFA matching, the match_limit number is a measure of the amount +of backtracking that takes place, and learning the minimum value can be +instructive. For most simple matches, the number is quite small, but for +patterns with very large numbers of matching possibilities, it can become large +very quickly with increasing length of subject string. In the case of DFA +matching, match_limit controls the total number of calls, both recursive +and non-recursive, to the internal matching function, thus controlling the +overall amount of computing resource that is used. +
++For both kinds of matching, the heap_limit number, which is in kibibytes +(units of 1024 bytes), limits the amount of heap memory used for matching. +
++The mark modifier causes the names from backtracking control verbs that +are returned from calls to pcre2_match() to be displayed. If a mark is +returned for a match, non-match, or partial match, pcre2test shows it. +For a match, it is on a line by itself, tagged with "MK:". Otherwise, it +is added to the non-match message. +
+The memory modifier causes pcre2test to log the sizes of all heap +memory allocation and freeing calls that occur during a call to +pcre2_match() or pcre2_dfa_match(). In the latter case, heap memory +is used only when a match requires more internal workspace than the default +allocation on the stack, so in many cases there will be no output. No heap +memory is allocated during matching with JIT. For this modifier to work, the +null_context modifier must not be set on both the pattern and the +subject, though it can be set on one or the other. +
++The heapframes_size modifier is relevant for matches using +pcre2_match() without JIT. After a match has run (whether successful or +not) the size, in bytes, of the allocated heap frames vector that is left +attached to the match data block is shown. If the matching action involved +several calls to pcre2_match() (for example, global matching or for +timing) only the final value is shown. +
+This modifier is ignored, with a warning, for POSIX or DFA matching. JIT +matching does not use the heap frames vector, so the size is always zero, +unless there was a previous non-JIT match. Note that specifying a size of zero +for the output vector (see below) causes pcre2test to free its match data +block (and associated heap frames vector) and allocate a new one. +
++The offset modifier sets an offset in the subject string at which +matching starts. Its value is a number of code units, not characters. +
++The offset_limit modifier sets a limit for unanchored matches. If a match +cannot be found starting at or before this offset in the subject, a "no match" +return is given. The data value is a number of code units, not characters. When +this modifier is used, the use_offset_limit modifier must have been set +for the pattern; if not, an error is generated. +
++The ovector modifier applies only to the subject line in which it +appears, though of course it can also be used to set a default in a +#subject command. It specifies the number of pairs of offsets that are +available for storing matching information. The default is 15. +
++A value of zero is useful when testing the POSIX API because it causes +regexec() to be called with a NULL capture vector. When not testing the +POSIX API, a value of zero is used to cause +pcre2_match_data_create_from_pattern() to be called, in order to create a +new match block of exactly the right size for the pattern. (It is not possible +to create a match block with a zero-length ovector; there is always at least +one pair of offsets.) The old match data block is freed. +
++By default, the subject string is passed to a native API matching function with +its correct length. In order to test the facility for passing a zero-terminated +string, the zero_terminate modifier is provided. It causes the length to +be passed as PCRE2_ZERO_TERMINATED. When matching via the POSIX interface, +this modifier is ignored, with a warning. +
++When testing pcre2_substitute(), this modifier also has the effect of +passing the replacement string as zero-terminated. +
++Normally, pcre2test passes a context block to pcre2_match(), +pcre2_dfa_match(), pcre2_jit_match() or pcre2_substitute(). +If the null_context modifier is set, however, NULL is passed. This is for +testing that the matching and substitution functions behave correctly in this +case (they use default values). This modifier cannot be used with the +find_limits, find_limits_noheap, or substitute_callout +modifiers. +
++Similarly, for testing purposes, if the null_subject or +null_replacement modifier is set, the subject or replacement string +pointers are passed as NULL, respectively, to the relevant functions. +
++By default, pcre2test uses the standard PCRE2 matching function, +pcre2_match() to match each subject line. PCRE2 also supports an +alternative matching function, pcre2_dfa_match(), which operates in a +different way, and has some restrictions. The differences between the two +functions are described in the +pcre2matching +documentation. +
++If the dfa modifier is set, the alternative matching function is used. +This function finds all possible matches at a given point in the subject. If, +however, the dfa_shortest modifier is set, processing stops after the +first match is found. This is always the shortest possible match. +
++This section describes the output when the normal matching function, +pcre2_match(), is being used. +
++When a match succeeds, pcre2test outputs the list of captured substrings, +starting with number 0 for the string that matched the whole pattern. +Otherwise, it outputs "No match" when the return is PCRE2_ERROR_NOMATCH, or +"Partial match:" followed by the partially matching substring when the +return is PCRE2_ERROR_PARTIAL. (Note that this is the +entire substring that was inspected during the partial match; it may include +characters before the actual match start if a lookbehind assertion, \K, \b, +or \B was involved.) +
++For any other return, pcre2test outputs the PCRE2 negative error number +and a short descriptive phrase. If the error is a failed UTF string check, the +code unit offset of the start of the failing character is also output. Here is +an example of an interactive pcre2test run. +
+ $ pcre2test + PCRE2 version 10.22 2016-07-29 + + re> /^abc(\d+)/ + data> abc123 + 0: abc123 + 1: 123 + data> xyz + No match ++Unset capturing substrings that are not followed by one that is set are not +shown by pcre2test unless the allcaptures modifier is specified. In +the following example, there are two capturing substrings, but when the first +data line is matched, the second, unset substring is not shown. An "internal" +unset substring is shown as "<unset>", as for the second data line. +
+ re> /(a)|(b)/ + data> a + 0: a + 1: a + data> b + 0: b + 1: <unset> + 2: b ++If the strings contain any non-printing characters, they are output as \xhh +escapes if the value is less than 256 and UTF mode is not set. Otherwise they +are output as \x{hh...} escapes. See below for the definition of non-printing +characters. If the aftertext modifier is set, the output for substring 0 +is followed by the rest of the subject string, identified by "0+" like this: +
+ re> /cat/aftertext + data> cataract + 0: cat + 0+ aract ++If global matching is requested, the results of successive matching attempts +are output in sequence, like this: +
+ re> /\Bi(\w\w)/g + data> Mississippi + 0: iss + 1: ss + 0: iss + 1: ss + 0: ipp + 1: pp ++"No match" is output only if the first match attempt fails. Here is an example +of a failure message (the offset 4 that is specified by the offset +modifier is past the end of the subject string): +
+ re> /xyz/ + data> xyz\=offset=4 + Error -24 (bad offset value) ++ +
+Note that whereas patterns can be continued over several lines (a plain ">" +prompt is used for continuations), subject lines may not. However newlines can +be included in a subject by means of the \n escape (or \r, \r\n, etc., +depending on the newline sequence setting). +
++When the alternative matching function, pcre2_dfa_match(), is used, the +output consists of a list of all the matches that start at the first point in +the subject where there is at least one match. For example: +
+ re> /(tang|tangerine|tan)/ + data> yellow tangerine\=dfa + 0: tangerine + 1: tang + 2: tan ++Using the normal matching function on this data finds only "tang". The +longest matching string is always given first (and numbered zero). After a +PCRE2_ERROR_PARTIAL return, the output is "Partial match:", followed by the +partially matching substring. Note that this is the entire substring that was +inspected during the partial match; it may include characters before the actual +match start if a lookbehind assertion, \b, or \B was involved. (\K is not +supported for DFA matching.) + +
+If global matching is requested, the search for further matches resumes +at the end of the longest match. For example: +
+ re> /(tang|tangerine|tan)/g + data> yellow tangerine and tangy sultana\=dfa + 0: tangerine + 1: tang + 2: tan + 0: tang + 1: tan + 0: tan ++The alternative matching function does not support substring capture, so the +modifiers that are concerned with captured substrings are not relevant. + +
+When the alternative matching function has given the PCRE2_ERROR_PARTIAL +return, indicating that the subject partially matched the pattern, you can +restart the match with additional subject data by means of the +dfa_restart modifier. For example: +
+ re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/ + data> 23ja\=ps,dfa + Partial match: 23ja + data> n05\=dfa,dfa_restart + 0: n05 ++For further information about partial matching, see the +pcre2partial +documentation. + +
+If the pattern contains any callout requests, pcre2test's callout +function is called during matching unless callout_none is specified. This +works with both matching functions, and with JIT, though there are some +differences in behaviour. The output for callouts with numerical arguments and +those with string arguments is slightly different. +
++By default, the callout function displays the callout number, the start and +current positions in the subject text at the callout time, and the next pattern +item to be tested. For example: +
+ --->pqrabcdef + 0 ^ ^ \d ++This output indicates that callout number 0 occurred for a match attempt +starting at the fourth character of the subject string, when the pointer was at +the seventh character, and when the next pattern item was \d. Just +one circumflex is output if the start and current positions are the same, or if +the current position precedes the start position, which can happen if the +callout is in a lookbehind assertion. + +
+Callouts numbered 255 are assumed to be automatic callouts, inserted as a +result of the auto_callout pattern modifier. In this case, instead of +showing the callout number, the offset in the pattern, preceded by a plus, is +output. For example: +
+ re> /\d?[A-E]\*/auto_callout + data> E* + --->E* + +0 ^ \d? + +3 ^ [A-E] + +8 ^^ \* + +10 ^ ^ + 0: E* ++If a pattern contains (*MARK) items, an additional line is output whenever +a change of latest mark is passed to the callout function. For example: +
+ re> /a(*MARK:X)bc/auto_callout + data> abc + --->abc + +0 ^ a + +1 ^^ (*MARK:X) + +10 ^^ b + Latest Mark: X + +11 ^ ^ c + +12 ^ ^ + 0: abc ++The mark changes between matching "a" and "b", but stays the same for the rest +of the match, so nothing more is output. If, as a result of backtracking, the +mark reverts to being unset, the text "<unset>" is output. + +
+The output for a callout with a string argument is similar, except that instead +of outputting a callout number before the position indicators, the callout +string and its offset in the pattern string are output before the reflection of +the subject string, and the subject string is reflected for each callout. For +example: +
+ re> /^ab(?C'first')cd(?C"second")ef/ + data> abcdefg + Callout (7): 'first' + --->abcdefg + ^ ^ c + Callout (20): "second" + --->abcdefg + ^ ^ e + 0: abcdef + ++ +
+The callout function in pcre2test returns zero (carry on matching) by +default, but you can use a callout_fail modifier in a subject line to +change this and other parameters of the callout (see below). +
++If the callout_capture modifier is set, the current captured groups are +output when a callout occurs. This is useful only for non-DFA matching, as +pcre2_dfa_match() does not support capturing, so no captures are ever +shown. +
++The normal callout output, showing the callout number or pattern offset (as +described above) is suppressed if the callout_no_where modifier is set. +
++When using the interpretive matching function pcre2_match() without JIT, +setting the callout_extra modifier causes additional output from +pcre2test's callout function to be generated. For the first callout in a +match attempt at a new starting position in the subject, "New match attempt" is +output. If there has been a backtrack since the last callout (or start of +matching if this is the first callout), "Backtrack" is output, followed by "No +other matching paths" if the backtrack ended the previous match attempt. For +example: +
+ re> /(a+)b/auto_callout,no_start_optimize,no_auto_possess + data> aac\=callout_extra + New match attempt + --->aac + +0 ^ ( + +1 ^ a+ + +3 ^ ^ ) + +4 ^ ^ b + Backtrack + --->aac + +3 ^^ ) + +4 ^^ b + Backtrack + No other matching paths + New match attempt + --->aac + +0 ^ ( + +1 ^ a+ + +3 ^^ ) + +4 ^^ b + Backtrack + No other matching paths + New match attempt + --->aac + +0 ^ ( + +1 ^ a+ + Backtrack + No other matching paths + New match attempt + --->aac + +0 ^ ( + +1 ^ a+ + No match ++Notice that various optimizations must be turned off if you want all possible +matching paths to be scanned. If no_start_optimize is not used, there is +an immediate "no match", without any callouts, because the starting +optimization fails to find "b" in the subject, which it knows must be present +for any match. If no_auto_possess is not used, the "a+" item is turned +into "a++", which reduces the number of backtracks. + +
+The callout_extra modifier has no effect if used with the DFA matching +function, or with JIT. +
++The default return from the callout function is zero, which allows matching to +continue. The callout_fail modifier can be given one or two numbers. If +there is only one number, 1 is returned instead of 0 (causing matching to +backtrack) when a callout of that number is reached. If two numbers (<n>:<m>) +are given, 1 is returned when callout <n> is reached and there have been at +least <m> callouts. The callout_error modifier is similar, except that +PCRE2_ERROR_CALLOUT is returned, causing the entire matching process to be +aborted. If both these modifiers are set for the same callout number, +callout_error takes precedence. Note that callouts with string arguments +are always given the number zero. +
++The callout_data modifier can be given an unsigned or a negative number. +This is set as the "user data" that is passed to the matching function, and +passed back when the callout function is invoked. Any value other than zero is +used as a return from pcre2test's callout function. +
++Inserting callouts can be helpful when using pcre2test to check +complicated regular expressions. For further information about callouts, see +the +pcre2callout +documentation. +
++When pcre2test is outputting text in the compiled version of a pattern, +bytes other than 32-126 are always treated as non-printing characters and are +therefore shown as hex escapes. +
++When pcre2test is outputting text that is a matched part of a subject +string, it behaves in the same way, unless a different locale has been set for +the pattern (using the locale modifier). In this case, the +isprint() function is used to distinguish printing and non-printing +characters. +
++It is possible to save compiled patterns on disc or elsewhere, and reload them +later, subject to a number of restrictions. JIT data cannot be saved. The host +on which the patterns are reloaded must be running the same version of PCRE2, +with the same code unit width, and must also have the same endianness, pointer +width and PCRE2_SIZE type. Before compiled patterns can be saved they must be +serialized, that is, converted to a stream of bytes. A single byte stream may +contain any number of compiled patterns, but they must all use the same +character tables. A single copy of the tables is included in the byte stream +(its size is 1088 bytes). +
++The functions whose names begin with pcre2_serialize_ are used +for serializing and de-serializing. They are described in the +pcre2serialize +documentation. In this section we describe the features of pcre2test that +can be used to test these functions. +
++Note that "serialization" in PCRE2 does not convert compiled patterns to an +abstract format like Java or .NET. It just makes a reloadable byte code stream. +Hence the restrictions on reloading mentioned above. +
++In pcre2test, when a pattern with the push modifier is successfully +compiled, it is pushed onto a stack of compiled patterns, and pcre2test +expects the next line to contain a new pattern (or command) instead of a +subject line. By contrast, the pushcopy modifier causes a copy of the +compiled pattern to be stacked, leaving the original available for immediate +matching. By using push and/or pushcopy, a number of patterns can +be compiled and retained. These modifiers are incompatible with posix, +and control modifiers that act at match time are ignored (with a message) for +the stacked patterns. The jitverify modifier applies only at compile +time. +
++The command +
+ #save <filename> ++causes all the stacked patterns to be serialized and the result written to the +named file. Afterwards, all the stacked patterns are freed. The command +
+ #load <filename> ++reads the data in the file, and then arranges for it to be de-serialized, with +the resulting compiled patterns added to the pattern stack. The pattern on the +top of the stack can be retrieved by the #pop command, which must be followed +by lines of subjects that are to be matched with the pattern, terminated as +usual by an empty line or end of file. This command may be followed by a +modifier list containing only +control modifiers +that act after a pattern has been compiled. In particular, hex, +posix, posix_nosub, push, and pushcopy are not allowed, +nor are any +option-setting modifiers. +The JIT modifiers are, however permitted. Here is an example that saves and +reloads two patterns. +
+ /abc/push + /xyz/push + #save tempfile + #load tempfile + #pop info + xyz + + #pop jit,bincode + abc ++If jitverify is used with #pop, it does not automatically imply +jit, which is different behaviour from when it is used on a pattern. + +
+The #popcopy command is analogous to the pushcopy modifier in that it +makes current a copy of the topmost stack pattern, leaving the original still +on the stack. +
++pcre2(3), pcre2api(3), pcre2callout(3), +pcre2jit(3), pcre2matching(3), pcre2partial(3), +pcre2pattern(3), pcre2serialize(3). +
+
+Philip Hazel
+
+Retired from University Computing Service
+
+Cambridge, England.
+
+
+Last updated: 24 April 2024
+
+Copyright © 1997-2024 University of Cambridge.
+
+
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2unicode.html b/doc/html/pcre2unicode.html new file mode 100644 index 0000000..6f0972e --- /dev/null +++ b/doc/html/pcre2unicode.html @@ -0,0 +1,522 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+UNICODE AND UTF SUPPORT
+
+
+PCRE2 is normally built with Unicode support, though if you do not need it, you +can build it without, in which case the library will be smaller. With Unicode +support, PCRE2 has knowledge of Unicode character properties and can process +strings of text in UTF-8, UTF-16, and UTF-32 format (depending on the code unit +width), but this is not the default. Unless specifically requested, PCRE2 +treats each code unit in a string as one character. +
++There are two ways of telling PCRE2 to switch to UTF mode, where characters may +consist of more than one code unit and the range of values is constrained. The +program can call +pcre2_compile() +with the PCRE2_UTF option, or the pattern may start with the sequence (*UTF). +However, the latter facility can be locked out by the PCRE2_NEVER_UTF option. +That is, the programmer can prevent the supplier of the pattern from switching +to UTF mode. +
++Note that the PCRE2_MATCH_INVALID_UTF option (see +below) +forces PCRE2_UTF to be set. +
++In UTF mode, both the pattern and any subject strings that are matched against +it are treated as UTF strings instead of strings of individual one-code-unit +characters. There are also some other changes to the way characters are +handled, as documented below. +
++When PCRE2 is built with Unicode support, the escape sequences \p{..}, +\P{..}, and \X can be used. This is not dependent on the PCRE2_UTF setting. +The Unicode properties that can be tested are a subset of those that Perl +supports. Currently they are limited to the general category properties such as +Lu for an upper case letter or Nd for a decimal number, the derived properties +Any and LC (synonym L&), the Unicode script names such as Arabic or Han, +Bidi_Class, Bidi_Control, and a few binary properties. +
++The full lists are given in the +pcre2pattern +and +pcre2syntax +documentation. In general, only the short names for properties are supported. +For example, \p{L} matches a letter. Its longer synonym, \p{Letter}, is not +supported. Furthermore, in Perl, many properties may optionally be prefixed by +"Is", for compatibility with Perl 5.6. PCRE2 does not support this. +
++Code points less than 256 can be specified in patterns by either braced or +unbraced hexadecimal escape sequences (for example, \x{b3} or \xb3). Larger +values have to use braced sequences. Unbraced octal code points up to \777 are +also recognized; larger ones can be coded using \o{...}. +
++The escape sequence \N{U+<hex digits>} is recognized as another way of +specifying a Unicode character by code point in a UTF mode. It is not allowed +in non-UTF mode. +
++In UTF mode, repeat quantifiers apply to complete UTF characters, not to +individual code units. +
++In UTF mode, the dot metacharacter matches one UTF character instead of a +single code unit. +
++In UTF mode, capture group names are not restricted to ASCII, and may contain +any Unicode letters and decimal digits, as well as underscore. +
++The escape sequence \C can be used to match a single code unit in UTF mode, +but its use can lead to some strange effects because it breaks up multi-unit +characters (see the description of \C in the +pcre2pattern +documentation). For this reason, there is a build-time option that disables +support for \C completely. There is also a less draconian compile-time option +for locking out the use of \C when a pattern is compiled. +
++The use of \C is not supported by the alternative matching function +pcre2_dfa_match() when in UTF-8 or UTF-16 mode, that is, when a character +may consist of more than one code unit. The use of \C in these modes provokes +a match-time error. Also, the JIT optimization does not support \C in these +modes. If JIT optimization is requested for a UTF-8 or UTF-16 pattern that +contains \C, it will not succeed, and so when pcre2_match() is called, +the matching will be carried out by the interpretive function. +
++The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly test +characters of any code value, but, by default, the characters that PCRE2 +recognizes as digits, spaces, or word characters remain the same set as in +non-UTF mode, all with code points less than 256. This remains true even when +PCRE2 is built to include Unicode support, because to do otherwise would slow +down matching in many common cases. Note that this also applies to \b +and \B, because they are defined in terms of \w and \W. If you want +to test for a wider sense of, say, "digit", you can use explicit Unicode +property tests such as \p{Nd}. Alternatively, if you set the PCRE2_UCP option, +the way that the character escapes work is changed so that Unicode properties +are used to determine which characters match, though there are some options +that suppress this for individual escapes. For details see the section on +generic character types +in the +pcre2pattern +documentation. +
++Like the escapes, characters that match the POSIX named character classes are +all low-valued characters unless the PCRE2_UCP option is set, but there is an +option to override this. +
++In contrast to the character escapes and character classes, the special +horizontal and vertical white space escapes (\h, \H, \v, and \V) do match +all the appropriate Unicode characters, whether or not PCRE2_UCP is set. +
++If either PCRE2_UTF or PCRE2_UCP is set, upper/lower case processing makes use +of Unicode properties except for characters whose code points are less than 128 +and that have at most two case-equivalent values. For these, a direct table +lookup is used for speed. A few Unicode characters such as Greek sigma have +more than two code points that are case-equivalent, and these are treated +specially. Setting PCRE2_UCP without PCRE2_UTF allows Unicode-style case +processing for non-UTF character encodings such as UCS-2. +
++There are two ASCII characters (S and K) that, in addition to their ASCII lower +case equivalents, have a non-ASCII one as well (long S and Kelvin sign). +Recognition of these non-ASCII characters as case-equivalent to their ASCII +counterparts can be disabled by setting the PCRE2_EXTRA_CASELESS_RESTRICT +option. When this is set, all characters in a case equivalence must either be +ASCII or non-ASCII; there can be no mixing. +
++The pattern constructs (*script_run:...) and (*atomic_script_run:...), with +synonyms (*sr:...) and (*asr:...), verify that the string matched within the +parentheses is a script run. In concept, a script run is a sequence of +characters that are all from the same Unicode script. However, because some +scripts are commonly used together, and because some diacritical and other +marks are used with multiple scripts, it is not that simple. +
++Every Unicode character has a Script property, mostly with a value +corresponding to the name of a script, such as Latin, Greek, or Cyrillic. There +are also three special values: +
++"Unknown" is used for code points that have not been assigned, and also for the +surrogate code points. In the PCRE2 32-bit library, characters whose code +points are greater than the Unicode maximum (U+10FFFF), which are accessible +only in non-UTF mode, are assigned the Unknown script. +
++"Common" is used for characters that are used with many scripts. These include +punctuation, emoji, mathematical, musical, and currency symbols, and the ASCII +digits 0 to 9. +
++"Inherited" is used for characters such as diacritical marks that modify a +previous character. These are considered to take on the script of the character +that they modify. +
++Some Inherited characters are used with many scripts, but many of them are only +normally used with a small number of scripts. For example, U+102E0 (Coptic +Epact thousands mark) is used only with Arabic and Coptic. In order to make it +possible to check this, a Unicode property called Script Extension exists. Its +value is a list of scripts that apply to the character. For the majority of +characters, the list contains just one script, the same one as the Script +property. However, for characters such as U+102E0 more than one Script is +listed. There are also some Common characters that have a single, non-Common +script in their Script Extension list. +
++The next section describes the basic rules for deciding whether a given string +of characters is a script run. Note, however, that there are some special cases +involving the Chinese Han script, and an additional constraint for decimal +digits. These are covered in subsequent sections. +
++A string that is less than two characters long is a script run. This is the +only case in which an Unknown character can be part of a script run. Longer +strings are checked using only the Script Extensions property, not the basic +Script property. +
++If a character's Script Extension property is the single value "Inherited", it +is always accepted as part of a script run. This is also true for the property +"Common", subject to the checking of decimal digits described below. All the +remaining characters in a script run must have at least one script in common in +their Script Extension lists. In set-theoretic terminology, the intersection of +all the sets of scripts must not be empty. +
++A simple example is an Internet name such as "google.com". The letters are all +in the Latin script, and the dot is Common, so this string is a script run. +However, the Cyrillic letter "o" looks exactly the same as the Latin "o"; a +string that looks the same, but with Cyrillic "o"s is not a script run. +
++More interesting examples involve characters with more than one script in their +Script Extension. Consider the following characters: +
+ U+060C Arabic comma + U+06D4 Arabic full stop ++The first has the Script Extension list Arabic, Hanifi Rohingya, Syriac, and +Thaana; the second has just Arabic and Hanifi Rohingya. Both of them could +appear in script runs of either Arabic or Hanifi Rohingya. The first could also +appear in Syriac or Thaana script runs, but the second could not. + +
+The Chinese Han script is commonly used in conjunction with other scripts for +writing certain languages. Japanese uses the Hiragana and Katakana scripts +together with Han; Korean uses Hangul and Han; Taiwanese Mandarin uses Bopomofo +and Han. These three combinations are treated as special cases when checking +script runs and are, in effect, "virtual scripts". Thus, a script run may +contain a mixture of Hiragana, Katakana, and Han, or a mixture of Hangul and +Han, or a mixture of Bopomofo and Han, but not, for example, a mixture of +Hangul and Bopomofo and Han. PCRE2 (like Perl) follows Unicode's Technical +Standard 39 ("Unicode Security Mechanisms", http://unicode.org/reports/tr39/) +in allowing such mixtures. +
++Unicode contains many sets of 10 decimal digits in different scripts, and some +scripts (including the Common script) contain more than one set. Some of these +decimal digits are visually indistinguishable from the common ASCII +digits. In addition to the script checking described above, if a script run +contains any decimal digits, they must all come from the same set of 10 +adjacent characters. +
++When the PCRE2_UTF option is set, the strings passed as patterns and subjects +are (by default) checked for validity on entry to the relevant functions. If an +invalid UTF string is passed, a negative error code is returned. The code unit +offset to the offending character can be extracted from the match data block by +calling pcre2_get_startchar(), which is used for this purpose after a UTF +error. +
++In some situations, you may already know that your strings are valid, and +therefore want to skip these checks in order to improve performance, for +example in the case of a long subject string that is being scanned repeatedly. +If you set the PCRE2_NO_UTF_CHECK option at compile time or at match time, +PCRE2 assumes that the pattern or subject it is given (respectively) contains +only valid UTF code unit sequences. +
++If you pass an invalid UTF string when PCRE2_NO_UTF_CHECK is set, the result +is undefined and your program may crash or loop indefinitely or give incorrect +results. There is, however, one mode of matching that can handle invalid UTF +subject strings. This is enabled by passing PCRE2_MATCH_INVALID_UTF to +pcre2_compile() and is discussed below in the next section. The rest of +this section covers the case when PCRE2_MATCH_INVALID_UTF is not set. +
++Passing PCRE2_NO_UTF_CHECK to pcre2_compile() just disables the UTF check +for the pattern; it does not also apply to subject strings. If you want to +disable the check for a subject string you must pass this same option to +pcre2_match() or pcre2_dfa_match(). +
++UTF-16 and UTF-32 strings can indicate their endianness by a special code known +as a byte-order mark (BOM). The PCRE2 functions do not handle this, expecting +strings to be in host byte order. +
++Unless PCRE2_NO_UTF_CHECK is set, a UTF string is checked before any other +processing takes place. In the case of pcre2_match() and +pcre2_dfa_match() calls with a non-zero starting offset, the check is +applied only to that part of the subject that could be inspected during +matching, and there is a check that the starting offset points to the first +code unit of a character or to the end of the subject. If there are no +lookbehind assertions in the pattern, the check starts at the starting offset. +Otherwise, it starts at the length of the longest lookbehind before the +starting offset, or at the start of the subject if there are not that many +characters before the starting offset. Note that the sequences \b and \B are +one-character lookbehinds. +
++In addition to checking the format of the string, there is a check to ensure +that all code points lie in the range U+0 to U+10FFFF, excluding the surrogate +area. The so-called "non-character" code points are not excluded because +Unicode corrigendum #9 makes it clear that they should not be. +
++Characters in the "Surrogate Area" of Unicode are reserved for use by UTF-16, +where they are used in pairs to encode code points with values greater than +0xFFFF. The code points that are encoded by UTF-16 pairs are available +independently in the UTF-8 and UTF-32 encodings. (In other words, the whole +surrogate thing is a fudge for UTF-16 which unfortunately messes up UTF-8 and +UTF-32.) +
++Setting PCRE2_NO_UTF_CHECK at compile time does not disable the error that is +given if an escape sequence for an invalid Unicode code point is encountered in +the pattern. If you want to allow escape sequences such as \x{d800} (a +surrogate code point) you can set the PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES extra +option. However, this is possible only in UTF-8 and UTF-32 modes, because these +values are not representable in UTF-16. +
++The following negative error codes are given for invalid UTF-8 strings: +
+ PCRE2_ERROR_UTF8_ERR1 + PCRE2_ERROR_UTF8_ERR2 + PCRE2_ERROR_UTF8_ERR3 + PCRE2_ERROR_UTF8_ERR4 + PCRE2_ERROR_UTF8_ERR5 ++The string ends with a truncated UTF-8 character; the code specifies how many +bytes are missing (1 to 5). Although RFC 3629 restricts UTF-8 characters to be +no longer than 4 bytes, the encoding scheme (originally defined by RFC 2279) +allows for up to 6 bytes, and this is checked first; hence the possibility of +4 or 5 missing bytes. +
+ PCRE2_ERROR_UTF8_ERR6 + PCRE2_ERROR_UTF8_ERR7 + PCRE2_ERROR_UTF8_ERR8 + PCRE2_ERROR_UTF8_ERR9 + PCRE2_ERROR_UTF8_ERR10 ++The two most significant bits of the 2nd, 3rd, 4th, 5th, or 6th byte of the +character do not have the binary value 0b10 (that is, either the most +significant bit is 0, or the next bit is 1). +
+ PCRE2_ERROR_UTF8_ERR11 + PCRE2_ERROR_UTF8_ERR12 ++A character that is valid by the RFC 2279 rules is either 5 or 6 bytes long; +these code points are excluded by RFC 3629. +
+ PCRE2_ERROR_UTF8_ERR13 ++A 4-byte character has a value greater than 0x10ffff; these code points are +excluded by RFC 3629. +
+ PCRE2_ERROR_UTF8_ERR14 ++A 3-byte character has a value in the range 0xd800 to 0xdfff; this range of +code points are reserved by RFC 3629 for use with UTF-16, and so are excluded +from UTF-8. +
+ PCRE2_ERROR_UTF8_ERR15 + PCRE2_ERROR_UTF8_ERR16 + PCRE2_ERROR_UTF8_ERR17 + PCRE2_ERROR_UTF8_ERR18 + PCRE2_ERROR_UTF8_ERR19 ++A 2-, 3-, 4-, 5-, or 6-byte character is "overlong", that is, it codes for a +value that can be represented by fewer bytes, which is invalid. For example, +the two bytes 0xc0, 0xae give the value 0x2e, whose correct coding uses just +one byte. +
+ PCRE2_ERROR_UTF8_ERR20 ++The two most significant bits of the first byte of a character have the binary +value 0b10 (that is, the most significant bit is 1 and the second is 0). Such a +byte can only validly occur as the second or subsequent byte of a multi-byte +character. +
+ PCRE2_ERROR_UTF8_ERR21 ++The first byte of a character has the value 0xfe or 0xff. These values can +never occur in a valid UTF-8 string. + +
+The following negative error codes are given for invalid UTF-16 strings: +
+ PCRE2_ERROR_UTF16_ERR1 Missing low surrogate at end of string + PCRE2_ERROR_UTF16_ERR2 Invalid low surrogate follows high surrogate + PCRE2_ERROR_UTF16_ERR3 Isolated low surrogate + ++ +
+The following negative error codes are given for invalid UTF-32 strings: +
+ PCRE2_ERROR_UTF32_ERR1 Surrogate character (0xd800 to 0xdfff) + PCRE2_ERROR_UTF32_ERR2 Code point is greater than 0x10ffff + ++ +
+You can run pattern matches on subject strings that may contain invalid UTF +sequences if you call pcre2_compile() with the PCRE2_MATCH_INVALID_UTF +option. This is supported by pcre2_match(), including JIT matching, but +not by pcre2_dfa_match(). When PCRE2_MATCH_INVALID_UTF is set, it forces +PCRE2_UTF to be set as well. Note, however, that the pattern itself must be a +valid UTF string. +
++If you do not set PCRE2_MATCH_INVALID_UTF when calling pcre2_compile, and +you are not certain that your subject strings are valid UTF sequences, you +should not make use of the JIT "fast path" function pcre2_jit_match() +because it bypasses sanity checks, including the one for UTF validity. An +invalid string may cause undefined behaviour, including looping, crashing, or +giving the wrong answer. +
++Setting PCRE2_MATCH_INVALID_UTF does not affect what pcre2_compile() +generates, but if pcre2_jit_compile() is subsequently called, it does +generate different code. If JIT is not used, the option affects the behaviour +of the interpretive code in pcre2_match(). When PCRE2_MATCH_INVALID_UTF +is set at compile time, PCRE2_NO_UTF_CHECK is ignored at match time. +
++In this mode, an invalid code unit sequence in the subject never matches any +pattern item. It does not match dot, it does not match \p{Any}, it does not +even match negative items such as [^X]. A lookbehind assertion fails if it +encounters an invalid sequence while moving the current point backwards. In +other words, an invalid UTF code unit sequence acts as a barrier which no match +can cross. +
++You can also think of this as the subject being split up into fragments of +valid UTF, delimited internally by invalid code unit sequences. The pattern is +matched fragment by fragment. The result of a successful match, however, is +given as code unit offsets in the entire subject string in the usual way. There +are a few points to consider: +
++The internal boundaries are not interpreted as the beginnings or ends of lines +and so do not match circumflex or dollar characters in the pattern. +
++If pcre2_match() is called with an offset that points to an invalid +UTF-sequence, that sequence is skipped, and the match starts at the next valid +UTF character, or the end of the subject. +
++At internal fragment boundaries, \b and \B behave in the same way as at the +beginning and end of the subject. For example, a sequence such as \bWORD\b +would match an instance of WORD that is surrounded by invalid UTF code units. +
++Using PCRE2_MATCH_INVALID_UTF, an application can run matches on arbitrary +data, knowing that any matched strings that are returned are valid UTF. This +can be useful when searching for UTF text in executable or other binary files. +
++Note, however, that the 16-bit and 32-bit PCRE2 libraries process strings as +sequences of uint16_t or uint32_t code points. They cannot find valid UTF +sequences within an arbitrary string of bytes unless such sequences are +suitably aligned. +
+
+Philip Hazel
+
+Retired from University Computing Service
+
+Cambridge, England.
+
+
+Last updated: 12 October 2023
+
+Copyright © 1997-2023 University of Cambridge.
+
+
+Return to the PCRE2 index page. +
diff --git a/doc/index.html.src b/doc/index.html.src new file mode 100644 index 0000000..e4dc786 --- /dev/null +++ b/doc/index.html.src @@ -0,0 +1,318 @@ + + + ++The HTML documentation for PCRE2 consists of a number of pages that are listed +below in alphabetical order. If you are new to PCRE2, please read the first one +first. +
+ +| pcre2 | +Introductory page |
| pcre2-config | +Information about the installation configuration |
| pcre2api | +PCRE2's native API |
| pcre2build | +Building PCRE2 |
| pcre2callout | +The callout facility |
| pcre2compat | +Compatibility with Perl |
| pcre2convert | +Experimental foreign pattern conversion functions |
| pcre2demo | +A demonstration C program that uses the PCRE2 library |
| pcre2grep | +The pcre2grep command |
| pcre2jit | +Discussion of the just-in-time optimization support |
| pcre2limits | +Details of size and other limits |
| pcre2matching | +Discussion of the two matching algorithms |
| pcre2partial | +Using PCRE2 for partial matching |
| pcre2pattern | +Specification of the regular expressions supported by PCRE2 |
| pcre2perform | +Some comments on performance |
| pcre2posix | +The POSIX API to the PCRE2 8-bit library |
| pcre2sample | +Discussion of the pcre2demo program |
| pcre2serialize | +Serializing functions for saving precompiled patterns |
| pcre2syntax | +Syntax quick-reference summary |
| pcre2test | +The pcre2test command for testing PCRE2 |
| pcre2unicode | +Discussion of Unicode and UTF-8/UTF-16/UTF-32 support |
+There are also individual pages that summarize the interface for each function +in the library. +
+ +| pcre2_callout_enumerate | +Enumerate callouts in a compiled pattern |
| pcre2_code_copy | +Copy a compiled pattern |
| pcre2_code_copy_with_tables | +Copy a compiled pattern and its character tables |
| pcre2_code_free | +Free a compiled pattern |
| pcre2_compile | +Compile a regular expression pattern |
| pcre2_compile_context_copy | +Copy a compile context |
| pcre2_compile_context_create | +Create a compile context |
| pcre2_compile_context_free | +Free a compile context |
| pcre2_config | +Show build-time configuration options |
| pcre2_convert_context_copy | +Copy a convert context |
| pcre2_convert_context_create | +Create a convert context |
| pcre2_convert_context_free | +Free a convert context |
| pcre2_converted_pattern_free | +Free converted foreign pattern |
| pcre2_dfa_match | +Match a compiled pattern to a subject string + (DFA algorithm; not Perl compatible) |
| pcre2_general_context_copy | +Copy a general context |
| pcre2_general_context_create | +Create a general context |
| pcre2_general_context_free | +Free a general context |
| pcre2_get_error_message | +Get textual error message for error number |
| pcre2_get_mark | +Get a (*MARK) name |
| pcre2_get_match_data_size | +Get the size of a match data block |
| pcre2_get_ovector_count | +Get the ovector count |
| pcre2_get_ovector_pointer | +Get a pointer to the ovector |
| pcre2_get_startchar | +Get the starting character offset |
| pcre2_jit_compile | +Process a compiled pattern with the JIT compiler |
| pcre2_jit_free_unused_memory | +Free unused JIT memory |
| pcre2_jit_match | +Fast path interface to JIT matching |
| pcre2_jit_stack_assign | +Assign stack for JIT matching |
| pcre2_jit_stack_create | +Create a stack for JIT matching |
| pcre2_jit_stack_free | +Free a JIT matching stack |
| pcre2_maketables | +Build character tables in current locale |
| pcre2_maketables_free | +Free character tables |
| pcre2_match | +Match a compiled pattern to a subject string + (Perl compatible) |
| pcre2_match_context_copy | +Copy a match context |
| pcre2_match_context_create | +Create a match context |
| pcre2_match_context_free | +Free a match context |
| pcre2_match_data_create | +Create a match data block |
| pcre2_match_data_create_from_pattern | +Create a match data block getting size from pattern |
| pcre2_match_data_free | +Free a match data block |
| pcre2_pattern_convert | +Experimental foreign pattern converter |
| pcre2_pattern_info | +Extract information about a pattern |
| pcre2_serialize_decode | +Decode serialized compiled patterns |
| pcre2_serialize_encode | +Serialize compiled patterns for save/restore |
| pcre2_serialize_free | +Free serialized compiled patterns |
| pcre2_serialize_get_number_of_codes | +Get number of serialized compiled patterns |
| pcre2_set_bsr | +Set \R convention |
| pcre2_set_callout | +Set up a callout function |
| pcre2_set_character_tables | +Set character tables |
| pcre2_set_compile_extra_options | +Set compile time extra options |
| pcre2_set_compile_recursion_guard | +Set up a compile recursion guard function |
| pcre2_set_depth_limit | +Set the match backtracking depth limit |
| pcre2_set_glob_escape | +Set glob escape character |
| pcre2_set_glob_separator | +Set glob separator character |
| pcre2_set_heap_limit | +Set the match backtracking heap limit |
| pcre2_set_match_limit | +Set the match limit |
| pcre2_set_max_pattern_compiled_length | +Set the maximum length of a compiled pattern |
| pcre2_set_max_pattern_length | +Set the maximum length of a pattern |
| pcre2_set_max_varlookbehind | +Set the maximum match length for a variable-length lookbehind |
| pcre2_set_newline | +Set the newline convention |
| pcre2_set_offset_limit | +Set the offset limit |
| pcre2_set_parens_nest_limit | +Set the parentheses nesting limit |
| pcre2_set_recursion_limit | +Obsolete: use pcre2_set_depth_limit |
| pcre2_set_recursion_memory_management | +Obsolete function that (from 10.30 onwards) does nothing |
| pcre2_substitute | +Match a compiled pattern to a subject string and do + substitutions |
| pcre2_substring_copy_byname | +Extract named substring into given buffer |
| pcre2_substring_copy_bynumber | +Extract numbered substring into given buffer |
| pcre2_substring_free | +Free extracted substring |
| pcre2_substring_get_byname | +Extract named substring into new memory |
| pcre2_substring_get_bynumber | +Extract numbered substring into new memory |
| pcre2_substring_length_byname | +Find length of named substring |
| pcre2_substring_length_bynumber | +Find length of numbered substring |
| pcre2_substring_list_free | +Free list of extracted substrings |
| pcre2_substring_list_get | +Extract all substrings into new memory |
| pcre2_substring_nametable_scan | +Find table entries for given string name |
| pcre2_substring_number_from_name | +Convert captured string name to number |