From 690c8817ceee6e39e6dc5843700c0f6d001341c9 Mon Sep 17 00:00:00 2001
From: Matthew Vernon
\n";
-$inpara = 1;
-}
-
-
-# Main program
-
-$innf = 0;
-$inpara = 0;
-$inpre = 0;
-$wrotetext = 0;
-$toc = 0;
-$ref = 1;
-
-while ($#ARGV >= 0 && $ARGV[0] =~ /^-/)
- {
- $toc = 1 if $ARGV[0] eq "-toc";
- shift;
- }
-
-# Initial output to STDOUT
-
-print <
-Return to the PCRE2 index page.
-
-This page is part of the PCRE2 HTML documentation. It was generated
-automatically from the original man page. If there is any nonsense in it,
-please consult the man page, in case the conversion went wrong.
-$ARGV[0] man page
-
-End
-
-print "\n" if ($toc);
-
-open(TEMP, ">/tmp/$$") || die "Can't open /tmp/$$ for output\n";
-
-while (
\n" if ($toc);
-
-# Copy the remainder to the standard output
-
-close(TEMP);
-open(TEMP, "/tmp/$$") || die "Can't open /tmp/$$ for input\n";
-
-print while (
- # and
that delimit literal sections will do the spacing. Always skip
- # if no previous output.
-
- elsif (/^\.sp/)
- {
- if ($wrotetext)
- {
- $_ =
\n
\n" if ($innf || /^\.nf/ || !/^[\s.]/);
- }
- redo; # Now process the lookahead line we just read
- }
- }
- elsif (/^\.TP/ || /^\.PP/ || /^\.P/)
- {
- &new_para();
- }
- elsif (/^\.SH\s*("?)(.*)\1/)
- {
- # Ignore the NAME section
- if ($2 =~ /^NAME\b/)
- {
-
$title
\n",
- $ref);
- $ref++;
- }
- else
- {
- print TEMP "
\n$title\n
\n";
- }
- }
- elsif (/^\.SS\s*("?)(.*)\1/)
- {
- &end_para();
- my($title) = &do_line($2);
- print TEMP "
\n$title\n
\n";
- }
- elsif (/^\.B\s*(.*)/)
- {
- &new_para() if (!$inpara);
- $_ = &do_line($1);
- s/"(.*?)"/$1/g;
- print TEMP "$_\n";
- $wrotetext = 1;
- }
- elsif (/^\.I\s*(.*)/)
- {
- &new_para() if (!$inpara);
- $_ = &do_line($1);
- s/"(.*?)"/$1/g;
- print TEMP "$_\n";
- $wrotetext = 1;
- }
-
- # Remove the "AUTOMATICALLY GENERATED" warning from pcre2demo.3
- elsif (/^\.\\"AUTOMATICALLY GENERATED/) { next; }
-
- # A comment that starts "HREF" takes the next line as a name that
- # is turned into a hyperlink, using the text given, which might be
- # in a special font. If it ends in () or (digits) or punctuation, they
- # aren't part of the link.
-
- elsif (/^\.\\"\s*HREF/)
- {
- $_=\n";
- while (
\n";
- $inpre = 1;
- }
- }
- elsif ($inpre)
- {
- print TEMP "\n";
- $inpre = 0;
- }
-
- # Add
to the end of a non-literal line if we are within .nf/.fi
-
- $_ .= "
\n" if (!$inpre && $innf);
-
- print TEMP;
- $wrotetext = 1;
- }
-
-# The TOC, if present, will have been written - terminate it
-
-print "
| Name | +Role | +
|---|---|
|
+
+ Nicholas Wilson + `nicholas@nicholaswilson.me.uk` + Currently of Microsoft Research Cambridge, UK + + |
+ + + * General project administration & maintenance + * Release management + * Code maintenance + + | +
|
+
+ Zoltán Herczeg + `hzmester@freemail.hu` + Currently of the University of Szeged, Hungary + + |
+ + + * Code maintenance + * Ownership of `sljit` and PCRE2's JIT + + | +
-Philip Hazel
-
-Retired from University Computing Service
-
-Cambridge, England.
-
+The current maintainers of PCRE2 are Nicholas Wilson and Zoltan Herczeg.
+
+PCRE2 was written by Philip Hazel, of the University Computing Service, +Cambridge, England. Many others have also contributed.
-Putting an actual email address here is a spam magnet. If you want to email me, -use my two names separated by a dot at gmail.com. +To contact the maintainers, please use the GitHub issues tracker or PCRE2 +mailing list, as described at the project page: +https://github.com/PCRE2Project/pcre2
-Last updated: 27 August 2021
+Last updated: 18 December 2024
Copyright © 1997-2021 University of Cambridge.
diff --git a/doc/html/pcre2_compile.html b/doc/html/pcre2_compile.html
index f0080ea..ee933f3 100644
--- a/doc/html/pcre2_compile.html
+++ b/doc/html/pcre2_compile.html
@@ -57,6 +57,7 @@ The primary option bits are:
PCRE2_ALLOW_EMPTY_CLASS Allow empty classes
PCRE2_ALT_BSUX Alternative handling of \u, \U, and \x
PCRE2_ALT_CIRCUMFLEX Alternative handling of ^ in multiline mode
+ PCRE2_ALT_EXTENDED_CLASS Alternative extended character class syntax
PCRE2_ALT_VERBNAMES Process backslashes in verb names
PCRE2_AUTO_CALLOUT Compile automatic callouts
PCRE2_CASELESS Do caseless matching
diff --git a/doc/html/pcre2_jit_compile.html b/doc/html/pcre2_jit_compile.html
index 873d0dd..791dd0c 100644
--- a/doc/html/pcre2_jit_compile.html
+++ b/doc/html/pcre2_jit_compile.html
@@ -33,9 +33,18 @@ details are given in the
documentation.
-The first argument is a pointer that was returned by a successful call to -pcre2_compile(), and the second must contain one or more of the following -bits: +The availability of JIT support can be tested by calling +pcre2_jit_compile() with a single option PCRE2_JIT_TEST_ALLOC (the +code argument is ignored, so a NULL value is accepted). Such a call +returns zero if JIT is available and has a working allocator. Otherwise +it returns PCRE2_ERROR_NOMEMORY if JIT is available but cannot allocate +executable memory, or PCRE2_ERROR_JIT_UNSUPPORTED if JIT support is not +compiled. +
++Otherwise, the first argument must be a pointer that was returned by a +successful call to pcre2_compile(), and the second must contain one or +more of the following bits:
PCRE2_JIT_COMPLETE compile code for full matching PCRE2_JIT_PARTIAL_SOFT compile code for soft partial matching @@ -46,11 +55,13 @@ superseded by the pcre2_compile() option PCRE2_MATCH_INVALID_UTF. The old option is deprecated and may be removed in the future.There is a complete description of the PCRE2 native API in the pcre2api diff --git a/doc/html/pcre2_set_max_pattern_compiled_length.html b/doc/html/pcre2_set_max_pattern_compiled_length.html index ab570cf..a40f41e 100644 --- a/doc/html/pcre2_set_max_pattern_compiled_length.html +++ b/doc/html/pcre2_set_max_pattern_compiled_length.html @@ -27,9 +27,9 @@ DESCRIPTION-The yield of the function is 0 for success, or a negative error code otherwise. -In particular, PCRE2_ERROR_JIT_BADOPTION is returned if JIT is not supported or -if an unknown bit is set in options. The function can also return -PCRE2_ERROR_NOMEMORY if JIT is unable to allocate executable memory for the -compiler, even if it was because of a system security restriction. +The yield of the function when called with any of the three options above is 0 +for success, or a negative error code otherwise. In particular, +PCRE2_ERROR_JIT_BADOPTION is returned if JIT is not supported or if an unknown +bit is set in options. The function can also return PCRE2_ERROR_NOMEMORY +if JIT is unable to allocate executable memory for the compiler, even if it was +because of a system security restriction. In a few cases, the function may +return with PCRE2_ERROR_JIT_UNSUPPORTED for unsupported features.
There is a complete description of the PCRE2 native API in the diff --git a/doc/html/pcre2_set_compile_extra_options.html b/doc/html/pcre2_set_compile_extra_options.html index 4924ed7..cb62022 100644 --- a/doc/html/pcre2_set_compile_extra_options.html +++ b/doc/html/pcre2_set_compile_extra_options.html @@ -43,6 +43,10 @@ options are: PCRE2_EXTRA_ESCAPED_CR_IS_LF Interpret \r as \n PCRE2_EXTRA_MATCH_LINE Pattern matches whole lines PCRE2_EXTRA_MATCH_WORD Pattern matches "words" + PCRE2_EXTRA_NEVER_CALLOUT Disallow callouts in pattern + PCRE2_EXTRA_NO_BS0 Disallow \0 (but not \00 or \000) + PCRE2_EXTRA_PYTHON_OCTAL Use Python rules for octal + PCRE2_EXTRA_TURKISH_CASING Use Turkish I case folding
This function sets, in a compile context, the maximum size (in bytes) for the -memory needed to hold the compiled version of a pattern that is compiled with -this context. The result is always zero. If a pattern that is passed to -pcre2_compile() with this context needs more memory, an error is +memory needed to hold the compiled version of a pattern that is using this +context. The result is always zero. If a pattern that is passed to +pcre2_compile() referencing this context needs more memory, an error is generated. The default is the largest number that a PCRE2_SIZE variable can hold, which is effectively unlimited.
diff --git a/doc/html/pcre2_set_optimize.html b/doc/html/pcre2_set_optimize.html new file mode 100644 index 0000000..47caeb2 --- /dev/null +++ b/doc/html/pcre2_set_optimize.html @@ -0,0 +1,57 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_set_optimize(pcre2_compile_context *ccontext, + uint32_t directive); +
++This function controls which performance optimizations will be applied +by pcre2_compile(). It can be called multiple times with the same compile +context; the effects are cumulative, with the effects of later calls taking +precedence over earlier ones. +
++The result is zero for success, PCRE2_ERROR_NULL if ccontext is NULL, +or PCRE2_ERROR_BADOPTION if directive is unknown. The latter could be +useful to detect if a certain optimization is available. +
++The possible values for the directive parameter are: +
+ PCRE2_OPTIMIZATION_FULL Enable all optimizations (default) + PCRE2_OPTIMIZATION_NONE Disable all optimizations + PCRE2_AUTO_POSSESS Enable auto-possessification + PCRE2_AUTO_POSSESS_OFF Disable auto-possessification + PCRE2_DOTSTAR_ANCHOR Enable implicit dotstar anchoring + PCRE2_DOTSTAR_ANCHOR_OFF Disable implicit dotstar anchoring + PCRE2_START_OPTIMIZE Enable start-up optimizations at match time + PCRE2_START_OPTIMIZE_OFF Disable start-up optimizations at match time ++There is a complete description of the PCRE2 native API, including detailed +descriptions of the directive parameter values, in the +pcre2api +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2_set_substitute_callout.html b/doc/html/pcre2_set_substitute_callout.html index 7ae3a39..8640728 100644 --- a/doc/html/pcre2_set_substitute_callout.html +++ b/doc/html/pcre2_set_substitute_callout.html @@ -20,7 +20,7 @@ SYNOPSISint pcre2_set_substitute_callout(pcre2_match_context *mcontext, - int (*callout_function)(pcre2_substitute_callout_block *), + int (*callout_function)(pcre2_substitute_callout_block *, void *), void *callout_data);
+Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+SYNOPSIS
+
+
+#include <pcre2.h> +
++int pcre2_set_substitute_case_callout(pcre2_match_context *mcontext, + PCRE2_SIZE (*callout_function)(PCRE2_SPTR, PCRE2_SIZE, + PCRE2_UCHAR *, PCRE2_SIZE, + int, void *), + void *callout_data); +
++This function sets the substitute case callout fields in a match context (the +first argument). The second argument specifies a callout function, and the third +argument is an opaque data item that is passed to it. The result of this +function is always zero. +
++There is a complete description of the PCRE2 native API in the +pcre2api +page and a description of the POSIX API in the +pcre2posix +page. +
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2api.html b/doc/html/pcre2api.html index 6b60ee9..079cf17 100644 --- a/doc/html/pcre2api.html +++ b/doc/html/pcre2api.html @@ -179,6 +179,10 @@ document for an overview of all the PCRE2 documentation.
@@ -203,6 +207,13 @@ document for an overview of all the PCRE2 documentation.
void *callout_data);
+int pcre2_set_substitute_case_callout(pcre2_match_context *mcontext,
+ PCRE2_SIZE (*callout_function)(PCRE2_SPTR, PCRE2_SIZE,
+ PCRE2_UCHAR *, PCRE2_SIZE,
+ int, void *),
+ void *callout_data);
+
+
int pcre2_set_offset_limit(pcre2_match_context *mcontext,
PCRE2_SIZE value);
@@ -808,6 +819,7 @@ following compile-time parameters:
The compile time nested parentheses limit
The maximum length of the pattern string
The extra options bits (none set by default)
+ Which performance optimizations the compiler should apply
A compile context is also required if you are using custom memory management.
If none of these apply, just pass NULL as the context argument of
@@ -952,6 +964,110 @@ The first argument to the callout function gives the current depth of
nesting, and the second is user data that is set up by the last argument of
pcre2_set_compile_recursion_guard(). The callout function should return
zero if all is well, or non-zero to force an error.
+
+
+int pcre2_set_optimize(pcre2_compile_context *ccontext,
+ uint32_t directive);
+
+
+PCRE2 can apply various performance optimizations during compilation, in order
+to make matching faster. For example, the compiler might convert some regex
+constructs into an equivalent construct which pcre2_match() can execute
+faster. By default, all available optimizations are enabled. However, in rare
+cases, one might wish to disable specific optimizations. For example, if it is
+known that some optimizations cannot benefit a certain regex, it might be
+desirable to disable them, in order to speed up compilation.
+
+The permitted values of directive are as follows: +
+ PCRE2_OPTIMIZATION_FULL ++Enable all optional performance optimizations. This is the default value. +
+ PCRE2_OPTIMIZATION_NONE ++Disable all optional performance optimizations. +
+ PCRE2_AUTO_POSSESS + PCRE2_AUTO_POSSESS_OFF ++Enable/disable "auto-possessification" of variable quantifiers such as * and +. +This optimization, for example, turns a+b into a++b in order to avoid +backtracks into a+ that can never be successful. However, if callouts are in +use, auto-possessification means that some callouts are never taken. You can +disable this optimization if you want the matching functions to do a full, +unoptimized search and run all the callouts. +
+ PCRE2_DOTSTAR_ANCHOR + PCRE2_DOTSTAR_ANCHOR_OFF ++Enable/disable an optimization that is applied when .* is the first significant +item in a top-level branch of a pattern, and all the other branches also start +with .* or with \A or \G or ^. Such a pattern is automatically anchored if +PCRE2_DOTALL is set for all the .* items and PCRE2_MULTILINE is not set for any +^ items. Otherwise, the fact that any match must start either at the start of +the subject or following a newline is remembered. Like other optimizations, +this can cause callouts to be skipped. + +
+Dotstar anchor optimization is automatically disabled for .* if it is inside an +atomic group or a capture group that is the subject of a backreference, or if +the pattern contains (*PRUNE) or (*SKIP). +
+ PCRE2_START_OPTIMIZE + PCRE2_START_OPTIMIZE_OFF ++Enable/disable optimizations which cause matching functions to scan the subject +string for specific code unit values before attempting a match. For example, if +it is known that an unanchored match must start with a specific value, the +matching code searches the subject for that value, and fails immediately if it +cannot find it, without actually running the main matching function. This means +that a special item such as (*COMMIT) at the start of a pattern is not +considered until after a suitable starting point for the match has been found. +Also, when callouts or (*MARK) items are in use, these "start-up" optimizations +can cause them to be skipped if the pattern is never actually used. The start-up +optimizations are in effect a pre-scan of the subject that takes place before +the pattern is run. + +
+Disabling start-up optimizations ensures that in cases where the result is "no +match", the callouts do occur, and that items such as (*COMMIT) and (*MARK) are +considered at every possible starting position in the subject string. +
++Disabling start-up optimizations may change the outcome of a matching operation. +Consider the pattern +
+ (*COMMIT)ABC ++When this is compiled, PCRE2 records the fact that a match must start with the +character "A". Suppose the subject string is "DEFABC". The start-up +optimization scans along the subject, finds "A" and runs the first match +attempt from there. The (*COMMIT) item means that the pattern must match the +current starting position, which in this case, it does. However, if the same +match is run without start-up optimizations, the initial scan along the subject +string does not happen. The first match attempt is run starting from "D" and +when this fails, (*COMMIT) prevents any further matches being tried, so the +overall result is "no match". + +
+Another start-up optimization makes use of a minimum length for a matching +subject, which is recorded when possible. Consider the pattern +
+ (*MARK:1)B(*MARK:2)(X|Y) ++The minimum length for a match is two characters. If the subject is "XXBB", the +"starting character" optimization skips "XX", then tries to match "BB", which +is long enough. In the process, (*MARK:2) is encountered and remembered. When +the match attempt fails, the next "B" is found, but there is only one character +left, so there are no more attempts, and "no match" is returned with the "last +mark seen" set to "2". Without start-up optimizations, however, matches are +tried at every possible starting position, including at the end of the subject, +where (*MARK:1) is encountered, but there is no "B", so the "last mark seen" +that is returned is "1". In this case, the optimizations do not affect the +overall match result, which is still "no match", but they do affect the +auxiliary information that is returned.
@@ -1431,7 +1563,7 @@ respectively, when pcre2_compile() returns NULL because a compilation error has occurred.For patterns that are auto-anchored, the PCRE2_ANCHORED bit is set in the options returned for PCRE2_INFO_ALLOPTIONS. @@ -3646,9 +3812,10 @@ PCRE2_SUBSTITUTE_OVERFLOW_LENGTH changes what happens when the output buffer is too small. The default action is to return PCRE2_ERROR_NOMEMORY immediately. If this option is set, however, pcre2_substitute() continues to go through the motions of matching and substituting (without, of course, writing anything) -in order to compute the size of buffer that is needed. This value is passed -back via the outlengthptr variable, with the result of the function still -being PCRE2_ERROR_NOMEMORY. +in order to compute the size of buffer that is needed, which will include the +extra space for the terminating NUL. This value is passed back via the +outlengthptr variable, with the result of the function still being +PCRE2_ERROR_NOMEMORY.-There are nearly 100 positive error codes that pcre2_compile() may return +There are over 100 positive error codes that pcre2_compile() may return if it finds an error in the pattern. There are also some negative error codes that are used for invalid UTF strings when validity checking is in force. These are the same as given by pcre2_match() and pcre2_dfa_match(), and @@ -1539,6 +1671,16 @@ after any internal newline. However, it does not match after a newline at the end of the subject, for compatibility with Perl. If you want a multiline circumflex also to match after a terminating newline, you must set PCRE2_ALT_CIRCUMFLEX. +
+ PCRE2_ALT_EXTENDED_CLASS ++Alters the parsing of character classes to follow the extended syntax +described by Unicode UTS#18. The PCRE2_ALT_EXTENDED_CLASS option has no impact +on the behaviour of the Perl-specific "(?[...])" syntax for extended classes, +but instead enables the alternative syntax of extended class behaviour inside +ordinary "[...]" character classes. See the +pcre2pattern +documentation for details of the character classes supported.PCRE2_ALT_VERBNAMES@@ -1569,16 +1711,31 @@ letters in the subject. It is equivalent to Perl's /i option, and it can be changed within a pattern by a (?i) option setting. If either PCRE2_UTF or PCRE2_UCP is set, Unicode properties are used for all characters with more than one other case, and for all characters whose code points are greater than -U+007F. Note that there are two ASCII characters, K and S, that, in addition to +U+007F. + ++Note that there are two ASCII characters, K and S, that, in addition to their lower case ASCII equivalents, are case-equivalent with U+212A (Kelvin sign) and U+017F (long S) respectively. If you do not want this case equivalence, you can suppress it by setting PCRE2_EXTRA_CASELESS_RESTRICT.
+One language family, Turkish and Azeri, has its own case-insensitivity rules, +which can be selected by setting PCRE2_EXTRA_TURKISH_CASING. This alters the +behaviour of the 'i', 'I', U+0130 (capital I with dot above), and U+0131 +(small dotless i) characters. +
+For lower valued characters with only one other case, a lookup table is used for speed. When neither PCRE2_UTF nor PCRE2_UCP is set, a lookup table is used for all code points less than 256, and higher code points (available only in 16-bit or 32-bit mode) are treated as not having another case. +
++From release 10.45 PCRE2_CASELESS also affects what some of the letter-related +Unicode property escapes (\p and \P) match. The properties Lu (upper case +letter), Ll (lower case letter), and Lt (title case letter) are all treated as +LC (cased letter) when PCRE2_CASELESS is set.
PCRE2_DOLLAR_ENDONLY@@ -1775,7 +1932,7 @@ This option locks out the use of Unicode properties for handling \B, \b, \D, for the PCRE2_UCP option below. In particular, it prevents the creator of the pattern from enabling this facility by starting the pattern with (*UCP). This option may be useful in applications that process patterns from external -sources. The option combination PCRE_UCP and PCRE_NEVER_UCP causes an error. +sources. The option combination PCRE2_UCP and PCRE2_NEVER_UCP causes an error.PCRE2_NEVER_UTF@@ -1798,85 +1955,57 @@ though the reference can be by name or by number.PCRE2_NO_AUTO_POSSESS-If this option is set, it disables "auto-possessification", which is an -optimization that, for example, turns a+b into a++b in order to avoid +If this (deprecated) option is set, it disables "auto-possessification", which +is an optimization that, for example, turns a+b into a++b in order to avoid backtracks into a+ that can never be successful. However, if callouts are in use, auto-possessification means that some callouts are never taken. You can set this option if you want the matching functions to do a full unoptimized search and run all the callouts, but it is mainly provided for testing purposes. + ++If a compile context is available, it is recommended to use +pcre2_set_optimize() with the directive PCRE2_AUTO_POSSESS_OFF rather +than the compile option PCRE2_NO_AUTO_POSSESS. Note that PCRE2_NO_AUTO_POSSESS +takes precedence over the pcre2_set_optimize() optimization directives +PCRE2_AUTO_POSSESS and PCRE2_AUTO_POSSESS_OFF.
PCRE2_NO_DOTSTAR_ANCHOR-If this option is set, it disables an optimization that is applied when .* is -the first significant item in a top-level branch of a pattern, and all the -other branches also start with .* or with \A or \G or ^. The optimization is -automatically disabled for .* if it is inside an atomic group or a capture -group that is the subject of a backreference, or if the pattern contains -(*PRUNE) or (*SKIP). When the optimization is not disabled, such a pattern is -automatically anchored if PCRE2_DOTALL is set for all the .* items and -PCRE2_MULTILINE is not set for any ^ items. Otherwise, the fact that any match -must start either at the start of the subject or following a newline is +If this (deprecated) option is set, it disables an optimization that is applied +when .* is the first significant item in a top-level branch of a pattern, and +all the other branches also start with .* or with \A or \G or ^. The +optimization is automatically disabled for .* if it is inside an atomic group +or a capture group that is the subject of a backreference, or if the pattern +contains (*PRUNE) or (*SKIP). When the optimization is not disabled, such a +pattern is automatically anchored if PCRE2_DOTALL is set for all the .* items +and PCRE2_MULTILINE is not set for any ^ items. Otherwise, the fact that any +match must start either at the start of the subject or following a newline is remembered. Like other optimizations, this can cause callouts to be skipped. +(If a compile context is available, it is recommended to use +pcre2_set_optimize() with the directive PCRE2_DOTSTAR_ANCHOR_OFF +instead.)PCRE2_NO_START_OPTIMIZEThis is an option whose main effect is at matching time. It does not change what pcre2_compile() generates, but it does affect the output of the JIT -compiler. +compiler. 
Setting this option is equivalent to calling pcre2_set_optimize() +with the directive parameter set to PCRE2_START_OPTIMIZE_OFF.There are a number of optimizations that may occur at the start of a match, in order to speed up the process. For example, if it is known that an unanchored match must start with a specific code unit value, the matching code searches the subject for that value, and fails immediately if it cannot find it, without -actually running the main matching function. This means that a special item -such as (*COMMIT) at the start of a pattern is not considered until after a -suitable starting point for the match has been found. Also, when callouts or -(*MARK) items are in use, these "start-up" optimizations can cause them to be -skipped if the pattern is never actually used. The start-up optimizations are +actually running the main matching function. The start-up optimizations are in effect a pre-scan of the subject that takes place before the pattern is run.
-The PCRE2_NO_START_OPTIMIZE option disables the start-up optimizations, -possibly causing performance to suffer, but ensuring that in cases where the -result is "no match", the callouts do occur, and that items such as (*COMMIT) -and (*MARK) are considered at every possible starting position in the subject -string. -
--Setting PCRE2_NO_START_OPTIMIZE may change the outcome of a matching operation. -Consider the pattern -
- (*COMMIT)ABC --When this is compiled, PCRE2 records the fact that a match must start with the -character "A". Suppose the subject string is "DEFABC". The start-up -optimization scans along the subject, finds "A" and runs the first match -attempt from there. The (*COMMIT) item means that the pattern must match the -current starting position, which in this case, it does. However, if the same -match is run with PCRE2_NO_START_OPTIMIZE set, the initial scan along the -subject string does not happen. The first match attempt is run starting from -"D" and when this fails, (*COMMIT) prevents any further matches being tried, so -the overall result is "no match". - --As another start-up optimization makes use of a minimum length for a matching -subject, which is recorded when possible. Consider the pattern -
- (*MARK:1)B(*MARK:2)(X|Y) --The minimum length for a match is two characters. If the subject is "XXBB", the -"starting character" optimization skips "XX", then tries to match "BB", which -is long enough. In the process, (*MARK:2) is encountered and remembered. When -the match attempt fails, the next "B" is found, but there is only one character -left, so there are no more attempts, and "no match" is returned with the "last -mark seen" set to "2". If NO_START_OPTIMIZE is set, however, matches are tried -at every possible starting position, including at the end of the subject, where -(*MARK:1) is encountered, but there is no "B", so the "last mark seen" that is -returned is "1". In this case, the optimizations do not affect the overall -match result, which is still "no match", but they do affect the auxiliary -information that is returned. +Disabling the start-up optimizations may cause performance to suffer. However, +this may be desirable for patterns which contain callouts or items such as +(*COMMIT) and (*MARK). See the above description of PCRE2_START_OPTIMIZE_OFF +for further details.PCRE2_NO_UTF_CHECK@@ -1931,9 +2060,16 @@ The second effect of PCRE2_UCP is to force the use of Unicode properties for upper/lower casing operations, even when PCRE2_UTF is not set. This makes it possible to process strings in the 16-bit UCS-2 code. This option is available only if PCRE2 has been compiled with Unicode support (which is the default). -The PCRE2_EXTRA_CASELESS_RESTRICT option (see below) restricts caseless + ++The PCRE2_EXTRA_CASELESS_RESTRICT option (see above) restricts caseless matching such that ASCII characters match only ASCII characters and non-ASCII -characters match only non-ASCII characters. +characters match only non-ASCII characters. The PCRE2_EXTRA_TURKISH_CASING option +(see above) alters the matching of the 'i' characters to follow their behaviour +in Turkish and Azeri languages. 
For further details on +PCRE2_EXTRA_CASELESS_RESTRICT and PCRE2_EXTRA_TURKISH_CASING, see the +pcre2unicode +page.
PCRE2_UNGREEDY@@ -2070,7 +2206,8 @@ characters. The ASCII letter S is case-equivalent to U+017f (long S) and the ASCII letter K is case-equivalent to U+212a (Kelvin sign). This option disables recognition of case-equivalences that cross the ASCII/non-ASCII boundary. In a caseless match, both characters must either be ASCII or non-ASCII. The option -can be changed with a pattern by the (?r) option setting. +can be changed within a pattern by the (*CASELESS_RESTRICT) or (?r) option +settings.PCRE2_EXTRA_ESCAPED_CR_IS_LF@@ -2097,6 +2234,34 @@ and the end. This is achieved by automatically inserting the code for "\b(?:" at the start of the compiled pattern and ")\b" at the end. The option may be used with PCRE2_LITERAL. However, it is ignored if PCRE2_EXTRA_MATCH_LINE is also set. ++ PCRE2_EXTRA_NO_BS0 ++If this option is set (note that its final character is the digit 0) it locks +out the use of the sequence \0 unless at least one more octal digit follows. ++ PCRE2_EXTRA_PYTHON_OCTAL ++If this option is set, PCRE2 follows Python's rules for interpreting octal +escape sequences. The rules for handling sequences such as \14, which could +be an octal number or a back reference are different. Details are given in the +pcre2pattern +documentation. ++ PCRE2_EXTRA_NEVER_CALLOUT ++If this option is set, PCRE2 treats callouts in the pattern as a syntax error, +returning PCRE2_ERROR_CALLOUT_CALLER_DISABLED. This is useful if the application +knows that a callout will not be provided to pcre2_match(), so that +callouts in the pattern are not silently ignored. ++ PCRE2_EXTRA_TURKISH_CASING ++This option alters case-equivalence of the 'i' letters to follow the +alphabet used by Turkish and Azeri languages. The option can be changed within +a pattern by the (*TURKISH_CASING) start-of-pattern setting. Either the UTF or +UCP options must be set. In the 8-bit library, UTF must be set. This option +cannot be combined with PCRE2_EXTRA_CASELESS_RESTRICT.
JUST-IN-TIME (JIT) COMPILATION
@@ -2303,6 +2468,7 @@ following are true: PCRE2_DOTALL is in force for .* Neither (*PRUNE) nor (*SKIP) appears in the pattern PCRE2_NO_DOTSTAR_ANCHOR is not set + Dotstar anchoring has not been disabled with PCRE2_DOTSTAR_ANCHOR_OFF
Passing a buffer size of zero is a permitted way of finding out how much memory @@ -3667,18 +3834,26 @@ If PCRE2_SUBSTITUTE_LITERAL is set, the replacement string is not interpreted in any way. By default, however, a dollar character is an escape character that can specify the insertion of characters from capture groups and names from (*MARK) or other control verbs in the pattern. Dollar is the only escape -character (backslash is treated as literal). The following forms are always +character (backslash is treated as literal). The following forms are recognized:
$$ insert a dollar character
- $<n> or ${<n>} insert the contents of group <n>
+ $n or ${n} insert the contents of group n
+ $0 or $& insert the entire matched substring
+ $` insert the substring that precedes the match
+ $' insert the substring that follows the match
+ $_ insert the entire input string
$*MARK or ${*MARK} insert a control verb name
-Either a group number or a group name can be given for <n>. Curly brackets are
-required only if the following character would be interpreted as part of the
-number or name. The number may be zero to include the entire matched string.
-For example, if the pattern a(b)c is matched with "=abc=" and the replacement
-string "+$1$0$1+", the result is "=+babcb+=".
+Either a group number or a group name can be given for n, for example $2 or
+$NAME. Curly brackets are required only if the following character would be
+interpreted as part of the number or name. The number may be zero to include
+the entire matched string. For example, if the pattern a(b)c is matched with
+"=abc=" and the replacement string "+$1$0$1+", the result is "=+babcb+=".
+
++The JavaScript form $<name>, where the angle brackets are part of the syntax, +is also recognized for group names, but not for group numbers or *MARK.
$*MARK inserts the name from the last encountered backtracking control verb on @@ -3732,28 +3907,53 @@ not influence the extended substitution syntax described below. PCRE2_SUBSTITUTE_EXTENDED causes extra processing to be applied to the replacement string. Without this option, only the dollar character is special, and only the group insertion forms listed above are valid. When -PCRE2_SUBSTITUTE_EXTENDED is set, two things change: +PCRE2_SUBSTITUTE_EXTENDED is set, several things change:
Firstly, backslash in a replacement string is interpreted as an escape -character. The usual forms such as \n or \x{ddd} can be used to specify -particular character codes, and backslash followed by any non-alphanumeric -character quotes that character. Extended quoting can be coded using \Q...\E, -exactly as in pattern strings. +character. The usual forms such as \x{ddd} can be used to specify particular +character codes, and backslash followed by any non-alphanumeric character +quotes that character. Extended quoting can be coded using \Q...\E, exactly +as in pattern strings. The escapes \b and \v are interpreted as the +characters backspace and vertical tab, respectively. +
++The interpretation of backslash followed by one or more digits is the same as +in a pattern, which in Perl has some ambiguities. Details are given in the +pcre2pattern +page. +
++The Python form \g<n>, where the angle brackets are part of the syntax and n +is either a group name or number, is recognized as an alternative way of +inserting the contents of a group, for example \g<3>.
There are also four escape sequences for forcing the case of inserted letters. -The insertion mechanism has three states: no case forcing, force upper case, -and force lower case. The escape sequences change the current state: \U and -\L change to upper or lower case forcing, respectively, and \E (when not -terminating a \Q quoted sequence) reverts to no case forcing. The sequences -\u and \l force the next character (if it is a letter) to upper or lower -case, respectively, and then the state automatically reverts to no case -forcing. Case forcing applies to all inserted characters, including those from -capture groups and letters within \Q...\E quoted sequences. If either -PCRE2_UTF or PCRE2_UCP was set when the pattern was compiled, Unicode +Case forcing applies to all inserted characters, including those from capture +groups and letters within \Q...\E quoted sequences. The insertion mechanism +has three states: no case forcing, force upper case, and force lower case. The +escape sequences change the current state: \U and \L change to upper or lower +case forcing, respectively, and \E (when not terminating a \Q quoted +sequence) reverts to no case forcing. The sequences \u and \l force the next +character (if it is a letter) to upper or lower case, respectively, and then +the state automatically reverts to no case forcing. +
++However, if \u is immediately followed by \L or \l is immediately followed +by \U, the next character's case is forced by the first escape sequence, and +subsequent characters by the second. This provides a "title casing" facility +that can be applied to group captures. For example, if group 1 has captured +"heLLo", the replacement string "\u\L$1" becomes "Hello". +
++If either PCRE2_UTF or PCRE2_UCP was set when the pattern was compiled, Unicode properties are used for case forcing characters whose code points are greater -than 127. +than 127. However, only simple case folding, as determined by the Unicode file +CaseFolding.txt is supported. PCRE2 does not support language-specific +special casing rules such as using different lower case Greek sigmas in the +middle and ends of words (as defined in the Unicode file +SpecialCasing.txt).
Note that case forcing sequences such as \U...\E do not nest. For example, @@ -3762,20 +3962,20 @@ effect. Note also that the PCRE2_ALT_BSUX and PCRE2_EXTRA_ALT_BSUX options do not apply to replacement strings.
-The second effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more +The final effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more flexibility to capture group substitution. The syntax is similar to that used by Bash:
- ${<n>:-<string>}
- ${<n>:+<string1>:<string2>}
+ ${n:-string}
+ ${n:+string1:string2}
-As before, <n> may be a group number or a name. The first form specifies a
-default value. If group <n> is set, its value is inserted; if not, <string> is
-expanded and the result inserted. The second form specifies strings that are
-expanded and inserted when group <n> is set or unset, respectively. The first
-form is just a convenient shorthand for
+As in the simple case, n may be a group number or a name. The first form
+specifies a default value. If group n is set, its value is inserted; if
+not, the string is expanded and the result inserted. The second form specifies
+strings that are expanded and inserted when group n is set or unset,
+respectively. The first form is just a convenient shorthand for
- ${<n>:+${<n>}:<string>}
+ ${n:+${n}:string}
Backslash can be used to escape colons and closing curly brackets in the
replacement strings. A change of the case forcing state within a replacement
@@ -3852,9 +4052,18 @@ Substitution callouts
The pcre2_set_substitution_callout() function can be used to specify a
callout function for pcre2_substitute(). This information is passed in
a match context. The callout function is called after each substitution has
-been processed, but it can cause the replacement not to happen. The callout
-function is not called for simulated substitutions that happen as a result of
-the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option.
+been processed, but it can cause the replacement not to happen.
+
++The callout function is not called for simulated substitutions that happen as a +result of the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option. In this mode, when +substitution processing exceeds the buffer space provided by the caller, +processing continues by counting code units. The simulation is unable to +populate the callout block, and so the simulation is pessimistic about the +required buffer size. Whichever is larger of accepted or rejected substitution +is reported as the required size. Therefore, the returned buffer length may be +an overestimate (without a substitution callout, it is normally an exact +measurement).
The first argument of the callout function is a pointer to a substitute callout @@ -3903,6 +4112,107 @@ PCRE2_SUBSTITUTE_GLOBAL is not set), the rest of the input is copied to the output and the call to pcre2_substitute() exits, returning the number of matches so far.
+
+int pcre2_set_substitute_case_callout(pcre2_match_context *mcontext,
+ PCRE2_SIZE (*callout_function)(PCRE2_SPTR, PCRE2_SIZE,
+ PCRE2_UCHAR *, PCRE2_SIZE,
+ int, void *),
+ void *callout_data);
+
+
+The pcre2_set_substitute_case_callout() function can be used to specify
+a callout function for pcre2_substitute() to use when performing case
+transformations. This does not affect any case insensitivity behaviour when
+performing a match, but only the user-visible transformations performed when
+processing a substitution such as:
+
+ pcre2_substitute(..., "\\U$1", ...) ++ +
+The default case transformations applied by PCRE2 are reasonably complete, and, +in UTF or UCP mode, perform the simple locale-invariant case transformations as +specified by Unicode. This is suitable for the internal (invisible) +case-equivalence procedures used during pattern matching, but an application +may wish to use more sophisticated locale-aware processing for the user-visible +substitution transformations. +
+
+One example implementation of the callout_function using the ICU
+library would be:
+
+
+
+ PCRE2_SIZE
+ icu_case_callout(
+ PCRE2_SPTR input, PCRE2_SIZE input_len,
+ PCRE2_UCHAR *output, PCRE2_SIZE output_cap,
+ int to_case, void *data_ptr)
+ {
+ UErrorCode err = U_ZERO_ERROR;
+ int32_t r = to_case == PCRE2_SUBSTITUTE_CASE_LOWER
+ ? u_strToLower(output, output_cap, input, input_len, NULL, &err)
+ : to_case == PCRE2_SUBSTITUTE_CASE_UPPER
+ ? u_strToUpper(output, output_cap, input, input_len, NULL, &err)
+ : u_strToTitle(output, output_cap, input, input_len, &first_char_only,
+ NULL, &err);
+ if (U_FAILURE(err)) return (~(PCRE2_SIZE)0);
+ return r;
+ }
+
+
++The first and second arguments of the case callout function are the Unicode +string to transform. +
++The third and fourth arguments are the output buffer and its capacity. +
++The fifth is one of the constants PCRE2_SUBSTITUTE_CASE_LOWER, +PCRE2_SUBSTITUTE_CASE_UPPER, or PCRE2_SUBSTITUTE_CASE_TITLE_FIRST. +PCRE2_SUBSTITUTE_CASE_LOWER and PCRE2_SUBSTITUTE_CASE_UPPER are passed to the +callout to indicate that the case of the entire callout input should be +case-transformed. PCRE2_SUBSTITUTE_CASE_TITLE_FIRST is passed to indicate that +only the first character or glyph should be transformed to Unicode titlecase +and the rest to Unicode lowercase (note that titlecasing sometimes uses Unicode +properties to titlecase each word in a string; but PCRE2 is requesting that only +the single leading character is to be titlecased). +
++The sixth argument is the callout_data supplied to +pcre2_set_substitute_case_callout(). +
++The resulting string in the destination buffer may be larger or smaller than the +input, if the casing rules merge or split characters. The return value is the +length required for the output string. If a buffer of sufficient size was +provided to the callout, then the result must be written to the buffer and the +number of code units returned. If the result does not fit in the provided +buffer, then the required capacity must be returned and PCRE2 will not make use +of the output buffer. PCRE2 provides input and output buffers which overlap, so +the callout must support this by suitable internal buffering. +
++Alternatively, if the callout wishes to indicate an error, then it may return +(~(PCRE2_SIZE)0). In this case pcre2_substitute() will immediately fail with +error PCRE2_ERROR_REPLACECASE. +
++When a case callout is combined with the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH +option, there are situations when pcre2_substitute() will return an +underestimate of the required buffer size. If you call pcre2_substitute() once +with PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, and the input buffer is too small for +the replacement string to be constructed, then instead of calling the case +callout, pcre2_substitute() will make an estimate of the required buffer size. +The second call should also pass PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, because that +second call is not guaranteed to succeed either, if the case callout requires +more buffer space than expected. The caller must make repeated attempts in a +loop. +
int pcre2_substring_nametable_scan(const pcre2_code *code, @@ -4177,7 +4487,7 @@ Cambridge, England.
-Last updated: 24 April 2024
+Last updated: 26 December 2024
Copyright © 1997-2024 University of Cambridge.
diff --git a/doc/html/pcre2build.html b/doc/html/pcre2build.html
index d4b0d33..f4e127f 100644
--- a/doc/html/pcre2build.html
+++ b/doc/html/pcre2build.html
@@ -643,7 +643,7 @@ Cambridge, England.
-Last updated: 15 April 2024
+Last updated: 16 April 2024
Copyright © 1997-2024 University of Cambridge.
diff --git a/doc/html/pcre2compat.html b/doc/html/pcre2compat.html
index d60182e..5f7e280 100644
--- a/doc/html/pcre2compat.html
+++ b/doc/html/pcre2compat.html
@@ -71,7 +71,7 @@ interprets them.
7. The Perl escape sequences \p, \P, and \X are supported only if PCRE2 is
built with Unicode support (the default). The properties that can be tested
with \p and \P are limited to the general category properties such as Lu and
-Nd, the derived properties Any and LC (synonym L&), script names such as Greek
+Nd, the derived properties Any and Lc (synonym L&), script names such as Greek
or Han, Bidi_Class, Bidi_Control, and a few binary properties. Both PCRE2 and
Perl support the Cs (surrogate) property, but in PCRE2 its use is limited. See
the
@@ -99,7 +99,12 @@ following examples:
\Q\\E \ \\E
The \Q...\E sequence is recognized both inside and outside character classes
-by both PCRE2 and Perl.
+by both PCRE2 and Perl. Another difference from Perl is that any appearance of
+\Q or \E inside what might otherwise be a quantifier causes PCRE2 not to
+recognize the sequence as a quantifier. Perl recognizes a quantifier if
+(redundantly) either of the numbers is inside \Q...\E, but not if the
+separating comma is. When not recognized as a quantifier a sequence such as
+{\Q1\E,2} is treated as the literal string "{1,2}".
9. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code}) @@ -120,7 +125,9 @@ confined to that group; it does not extend to the surrounding pattern. This is not always the case in Perl. In particular, if (*THEN) is present in a group that is called as a subroutine, its action is limited to that group, even if the group does not contain any | characters. Note that such groups are -processed as anchored at the point where they are tested. +processed as anchored at the point where they are tested. PCRE2 also confines +all control verbs within atomic assertions, again including (*THEN) in +assertions with only one branch.
12. If a pattern contains more than one backtracking control verb, the first @@ -159,11 +166,11 @@ warning features, so it gives an error in these cases because they are almost certainly user mistakes.
-17. In PCRE2, the upper/lower case character properties Lu and Ll are not -affected when case-independent matching is specified. For example, \p{Lu} -always matches an upper case letter. I think Perl has changed in this respect; -in the release at the time of writing (5.38), \p{Lu} and \p{Ll} match all -letters, regardless of case, when case independence is specified. +17. In PCRE2, until release 10.45, the upper/lower case character properties Lu +and Ll were not affected when case-independent matching was specified. Perl has +changed in this respect, and PCRE2 has now changed to match. When caseless +matching is in force, Lu, Ll, and Lt (title case) are all treated as Lc (cased +letter).
18. From release 5.32.0, Perl locks out the use of \K in lookaround
@@ -231,6 +238,10 @@ and condition references such as (?(4)...). PCRE2 supports relative group
numbers such as +2 and -4 in all three cases. Perl supports both plus and minus
for subroutine calls, but only minus for back references, and no relative
numbering at all for conditions.
+
+
+(m) The scan substring assertion (syntax (*scs:(n)...)) is a PCRE2 extension
+that is not available in Perl.
20. Perl has different limits than PCRE2. See the @@ -252,6 +263,18 @@ handled by PCRE2, either by the interpreter or the JIT. An example is /(?:|(?0)abcd)(?(R)|\z)/, which matches a sequence of any number of repeated "abcd" substrings at the end of the subject.
++23. Both PCRE2 and Perl error when \x{ escapes are invalid, but Perl tries to +recover and prints a warning if the problem was that an invalid hexadecimal +digit was found; since PCRE2 doesn't have warnings, it returns an error instead. +Additionally, Perl accepts \x{} and generates NUL unlike PCRE2. +
++24. From release 10.45, PCRE2 gives an error if \x is not followed by a +hexadecimal digit or a curly bracket. It used to interpret this as the NUL +character. Perl still generates NUL, but warns when in warning mode in most +cases. +
-Last updated: 30 November 2023
+Last updated: 02 October 2024
-Copyright © 1997-2023 University of Cambridge.
+Copyright © 1997-2024 University of Cambridge.
Return to the PCRE2 index page. diff --git a/doc/html/pcre2convert.html b/doc/html/pcre2convert.html index 6b9fea5..57e8989 100644 --- a/doc/html/pcre2convert.html +++ b/doc/html/pcre2convert.html @@ -182,7 +182,7 @@ Cambridge, England.
-Last updated: 28 June 2018
+Last updated: 14 November 2023
Copyright © 1997-2018 University of Cambridge.
diff --git a/doc/html/pcre2grep.html b/doc/html/pcre2grep.html
index bd12246..5c9a57a 100644
--- a/doc/html/pcre2grep.html
+++ b/doc/html/pcre2grep.html
@@ -391,9 +391,10 @@ Read patterns from the file, one per line. As is the case with patterns on the
command line, no delimiters should be used. What constitutes a newline when
reading the file is the operating system's default interpretation of \n. The
--newline option has no effect on this option. Trailing white space is
-removed from each line, and blank lines are ignored. An empty file contains no
+removed from each line, and blank lines are ignored unless the
+--posix-pattern-file option is also provided. An empty file contains no
patterns and therefore matches nothing. Patterns read from a file in this way
-may contain binary zeros, which are treated as ordinary data characters.
+may contain binary zeros, which are treated as ordinary character literals.
If this option is given more than once, all the specified files are read. A
@@ -723,9 +724,9 @@ text.
$<digits> or ${<digits>} is replaced by the captured substring of the given
-decimal number; zero substitutes the whole match. If the number is greater than
-the number of capturing substrings, or if the capture is unset, the replacement
-is empty.
+decimal number; $& (or the legacy $0) substitutes the whole match. If the
+number is greater than the number of capturing substrings, or if the capture
+is unset, the replacement is empty.
$a is replaced by bell; $b by backspace; $e by escape; $f by form feed; $n by
@@ -808,6 +809,15 @@ when in UCP mode, the sequence (?aP) restricts [:word:] to ASCII letters, while
allowing \w to match Unicode letters and digits.
+--posix-pattern-file +When patterns are provided with the -f option, do not trim trailing +spaces or ignore empty lines in a similar way to other grep tools. To keep +the behaviour consistent with older versions, if the pattern read was +terminated with CRLF (as character literals) then both characters won't be +included as part of it, so if you really need to have a pattern ending in '\r', +use an escape sequence or provide it by a different method. +
+-q, --quiet Work quietly, that is, display nothing except error messages. The exit status indicates whether or not any matches were found. @@ -993,7 +1003,7 @@ scripts or echoing specific strings during matching by making use of PCRE2's callout facility. However, this support can be completely or partially disabled when pcre2grep is built. You can find out whether your binary has support for callouts by running it with the --help option. If callout support is -completely disabled, all callouts in patterns are ignored by pcre2grep. +completely disabled, callouts in patterns are forbidden by pcre2grep. If the facility is partially disabled, calling external programs is not supported, and callouts that request it are ignored.
@@ -1015,9 +1025,9 @@ available, provided that callouts were not completely disabled when zero-terminated string, which means it should not contain any internal binary zeros. It is written to the output, having first been passed through the same escape processing as text from the --output (-O) option (see -above). However, $0 cannot be used to insert a matched substring because the -match is still in progress. Instead, the single character '0' is inserted. Any -syntax errors in the string (for example, a dollar not followed by another +above). However, $0 or $& cannot be used to insert a matched substring because +the match is still in progress. Instead, the single character '0' is inserted. +Any syntax errors in the string (for example, a dollar not followed by another character) causes the callout to be ignored. No terminator is added to the output string, so if you want a newline, you must include it explicitly using the escape $n. For example: @@ -1047,9 +1057,9 @@ arguments: Any substring (including the executable name) may contain escape sequences started by a dollar character. These are the same as for the --output -(-O) option documented above, except that $0 cannot insert the matched -string because the match is still in progress. Instead, the character '0' -is inserted. If you need a literal dollar or pipe character in any +(-O) option documented above, except that $0 or $& cannot insert the +matched string because the match is still in progress. Instead, the character +'0' is inserted. If you need a literal dollar or pipe character in any substring, use $$ or $| respectively. Here is an example:echo -e "abcde\n12345" | pcre2grep \ @@ -1116,7 +1126,7 @@ Cambridge, England.
REVISION
-Last updated: 22 December 2023 +Last updated: 09 October 2024
Copyright © 1997-2023 University of Cambridge.
diff --git a/doc/html/pcre2jit.html b/doc/html/pcre2jit.html index d97d800..6835cd8 100644 --- a/doc/html/pcre2jit.html +++ b/doc/html/pcre2jit.html @@ -64,7 +64,7 @@ platforms: If --enable-jit is set on an unsupported platform, compilation fails.-A client program can tell if JIT support is available by calling +A client program can tell if JIT support has been compiled by calling pcre2_config() with the PCRE2_CONFIG_JIT option. The result is one if PCRE2 was built with JIT support, and zero otherwise. However, having the JIT code available does not guarantee that it will be used for any particular @@ -72,11 +72,19 @@ match. One reason for this is that there are a number of options and pattern items that are not supported by JIT (see below). Another reason is that in some environments JIT is unable to get -memory in which to build its compiled code. The only guarantee from +executable memory in which to build its compiled code. The only guarantee from pcre2_config() is that if it returns zero, JIT will definitely not be used.
+As of release 10.45 there is a more informative way to test for JIT support. If +pcre2_jit_compile() is called with the single option PCRE2_JIT_TEST_ALLOC +it returns zero if JIT is available and has a working allocator. Otherwise it +returns PCRE2_ERROR_NOMEMORY if JIT is available but cannot allocate executable +memory, or PCRE2_ERROR_JIT_UNSUPPORTED if JIT support is not compiled. The +code argument is ignored, so it can be a NULL value. +
+A simple program does not need to check availability in order to use JIT when possible. The API is implemented in a way that falls back to the interpretive code if JIT is not available or cannot be used for a given match. For programs @@ -126,7 +134,8 @@ option bits. For example, you can call it once with PCRE2_JIT_COMPLETE and PCRE2_JIT_COMPLETE and PCRE2_JIT_PARTIAL_HARD. This time it will ignore PCRE2_JIT_COMPLETE and just compile code for partial matching. If pcre2_jit_compile() is called with no option bits set, it immediately -returns zero. This is an alternative way of testing whether JIT is available. +returns zero. This is an alternative way of testing whether JIT support has +been compiled.
At present, it is not possible to free JIT compiled code except when the entire @@ -487,7 +496,7 @@ Cambridge, England.
REVISION
-Last updated: 21 February 2024 +Last updated: 22 August 2024
Copyright © 1997-2024 University of Cambridge.
diff --git a/doc/html/pcre2limits.html b/doc/html/pcre2limits.html index 8152ed2..514c50b 100644 --- a/doc/html/pcre2limits.html +++ b/doc/html/pcre2limits.html @@ -96,7 +96,7 @@ Cambridge, England. REVISION
-Last updated: August 2023 +Last updated: 16 August 2023
Copyright © 1997-2023 University of Cambridge.
diff --git a/doc/html/pcre2matching.html b/doc/html/pcre2matching.html index 3b8b629..4d02325 100644 --- a/doc/html/pcre2matching.html +++ b/doc/html/pcre2matching.html @@ -27,7 +27,7 @@ please consult the man page, in case the conversion went wrong. This document describes the two different algorithms that are available in PCRE2 for matching a compiled regular expression against a given subject string. The "standard" algorithm is the one provided by the pcre2_match() -function. This works in the same as Perl's matching function, and provide a +function. This works in the same way as Perl's matching function, and provides a Perl-compatible matching operation. The just-in-time (JIT) optimization that is described in the pcre2jit @@ -42,7 +42,7 @@ these are described below.When there is only one possible way in which a given subject string can match a pattern, the two algorithms give the same answer. A difference arises, however, -when there are multiple possibilities. For example, if the pattern +when there are multiple possibilities. For example, if the anchored pattern
^<.*>@@ -115,9 +115,9 @@ algorithm after the first match (which is necessarily the shortest) is found.Note that the size of vector needed to contain all the results depends on the -number of simultaneous matches, not on the number of parentheses in the -pattern. Using pcre2_match_data_create_from_pattern() to create the match -data block is therefore not advisable when doing DFA matching. +number of simultaneous matches, not on the number of capturing parentheses in +the pattern. Using pcre2_match_data_create_from_pattern() to create the +match data block is therefore not advisable when doing DFA matching.
Note also that all the matches that are found start at the same point in the @@ -166,37 +166,43 @@ possibilities, and PCRE2's implementation of this algorithm does not attempt to do this. This means that no captured substrings are available.
-3. Because no substrings are captured, backreferences within the pattern are -not supported. -
--4. For the same reason, conditional expressions that use a backreference as the -condition or test for a specific group recursion are not supported. -
--5. Again for the same reason, script runs are not supported. +3. Because no substrings are captured, a number of related features are not +available: +
+
+(a) Backreferences; +
+
+(b) Conditional expressions that use a backreference as the condition or test +for a specific group recursion; +
+
+(c) Script runs; +
+
+(d) Scan substring assertions.-6. Because many paths through the tree may be active, the \K escape sequence, +4. Because many paths through the tree may be active, the \K escape sequence, which resets the start of the match when encountered (but may be on some paths and not on others), is not supported.
-7. Callouts are supported, but the value of the capture_top field is +5. Callouts are supported, but the value of the capture_top field is always 1, and the value of the capture_last field is always 0.
-8. The \C escape sequence, which (in the standard algorithm) always matches a -single code unit, even in a UTF mode, is not supported in these modes, because +6. The \C escape sequence, which (in the standard algorithm) always matches a +single code unit, even in a UTF mode, is not supported in UTF modes because the alternative algorithm moves through the subject string one character (not code unit) at a time, for all active paths through the tree.
-9. Except for (*FAIL), the backtracking control verbs such as (*PRUNE) are not +7. Except for (*FAIL), the backtracking control verbs such as (*PRUNE) are not supported. (*FAIL) is supported, and behaves like a failing negative assertion.
-10. The PCRE2_MATCH_INVALID_UTF option for pcre2_compile() is not +8. The PCRE2_MATCH_INVALID_UTF option for pcre2_compile() is not supported by pcre2_dfa_match().
ADVANTAGES OF THE ALTERNATIVE ALGORITHM
@@ -223,15 +229,18 @@ because it has to search for all possible matches, but is also because it is less susceptible to optimization.-2. Capturing parentheses, backreferences, script runs, and matching within -invalid UTF string are not supported. +2. Capturing parentheses and other features such as backreferences that rely on +them are not supported. +
++3. Matching within invalid UTF strings is not supported.
-3. Although atomic groups are supported, their use does not provide the +4. Although atomic groups are supported, their use does not provide the performance advantage that it does for the standard algorithm.
-4. JIT optimization is not supported. +5. JIT optimization is not supported.
AUTHOR
@@ -244,7 +253,7 @@ Cambridge, England.
REVISION
-Last updated: 19 January 2024 +Last updated: 30 August 2024
Copyright © 1997-2024 University of Cambridge.
diff --git a/doc/html/pcre2partial.html b/doc/html/pcre2partial.html index 64116c4..067064d 100644 --- a/doc/html/pcre2partial.html +++ b/doc/html/pcre2partial.html @@ -399,7 +399,7 @@ Cambridge, England.
REVISION
-Last updated: 04 September 2019 +Last updated: 27 November 2024
Copyright © 1997-2019 University of Cambridge.
diff --git a/doc/html/pcre2pattern.html b/doc/html/pcre2pattern.html index cf50c1a..84eb0aa 100644 --- a/doc/html/pcre2pattern.html +++ b/doc/html/pcre2pattern.html @@ -14,37 +14,41 @@ please consult the man page, in case the conversion went wrong.
@@ -52,9 +56,11 @@ The syntax and semantics of the regular expressions that are supported by PCRE2 are described in detail below. There is a quick-reference syntax summary in the pcre2syntax page. PCRE2 tries to match Perl syntax and semantics as closely as it can. -PCRE2 also supports some alternative regular expression syntax (which does not -conflict with the Perl syntax) in order to provide some compatibility with -regular expressions in Python, .NET, and Oniguruma. +PCRE2 also supports some alternative regular expression syntax that does not +conflict with the Perl syntax in order to provide some compatibility with +regular expressions in Python, .NET, and Oniguruma. There are in addition some +options that enable alternative syntax and semantics that are not the same as +in Perl.
Perl's regular expressions are described in its own documentation, and regular @@ -74,7 +80,19 @@ function, are discussed in the pcre2matching page.
-+Most computers use ASCII or Unicode for encoding characters, and PCRE2 assumes +this by default. However, it can be compiled to run in an environment that uses +the EBCDIC code, which is the case for some IBM mainframe operating systems. In +the sections below, character code values are ASCII or Unicode; in an EBCDIC +environment these characters may have different code values, and there are no +code points greater than 255. Differences in behaviour when PCRE2 is running in +an EBCDIC environment are described in the section +"EBCDIC environments" +below, which you can ignore unless you really are in an EBCDIC environment. +
+
A number of options that can be passed to pcre2_compile() can also be set
by special items at the start of a pattern. These are not Perl-compatible, but
@@ -141,7 +159,8 @@ Disabling auto-possessification
If a pattern starts with (*NO_AUTO_POSSESS), it has the same effect as setting
-the PCRE2_NO_AUTO_POSSESS option. This stops PCRE2 from making quantifiers
+the PCRE2_NO_AUTO_POSSESS option, or calling pcre2_set_optimize() with
+a PCRE2_AUTO_POSSESS_OFF directive. This stops PCRE2 from making quantifiers
possessive when what follows cannot match the repeated item. For example, by
default a+b is treated as a++b. For more details, see the
pcre2api
@@ -152,8 +171,9 @@ Disabling start-up optimizations
If a pattern starts with (*NO_START_OPT), it has the same effect as setting the -PCRE2_NO_START_OPTIMIZE option. This disables several optimizations for quickly -reaching "no match" results. For more details, see the +PCRE2_NO_START_OPTIMIZE option, or calling pcre2_set_optimize() with +a PCRE2_START_OPTIMIZE_OFF directive. This disables several optimizations for +quickly reaching "no match" results. For more details, see the pcre2api documentation.
@@ -162,7 +182,8 @@ Disabling automatic anchoringIf a pattern starts with (*NO_DOTSTAR_ANCHOR), it has the same effect as -setting the PCRE2_NO_DOTSTAR_ANCHOR option. This disables optimizations that +setting the PCRE2_NO_DOTSTAR_ANCHOR option, or calling pcre2_set_optimize() +with a PCRE2_DOTSTAR_ANCHOR_OFF directive. This disables optimizations that apply to patterns whose top-level branches all start with .* (match any number of arbitrary characters). For more details, see the pcre2api @@ -275,14 +296,6 @@ at compile time. This effect can also be achieved by starting a pattern with (*BSR_ANYCRLF). For completeness, (*BSR_UNICODE) is also recognized, corresponding to PCRE2_BSR_UNICODE.
--PCRE2 can be compiled to run in an environment that uses EBCDIC as its -character code instead of ASCII or Unicode (typically a mainframe system). In -the sections below, character code values are ASCII or Unicode; in an EBCDIC -environment these characters may have different code values, and there are no -code points greater than 255. -
A regular expression is a pattern that is matched against a subject string from @@ -298,7 +311,10 @@ ASCII characters, K and S, that, in addition to their lower case ASCII equivalents, are case-equivalent with Unicode U+212A (Kelvin sign) and U+017F (long S) respectively when either PCRE2_UTF or PCRE2_UCP is set, unless the PCRE2_EXTRA_CASELESS_RESTRICT option is in force (either passed to -pcre2_compile() or set by (?r) within the pattern). +pcre2_compile() or set by (*CASELESS_RESTRICT) or (?r) within the +pattern). If the PCRE2_EXTRA_TURKISH_CASING option is in force (either passed +to pcre2_compile() or set by (*TURKISH_CASING) within the pattern), then +the 'i' letters are matched according to Turkish and Azeri languages.
The power of regular expressions comes from the ability to include wild cards, @@ -346,7 +362,7 @@ a character class the only metacharacters are: If a pattern is compiled with the PCRE2_EXTENDED option, most white space in the pattern, other than in a character class, within a \Q...\E sequence, or -between a # outside a character class and the next newline, inclusive, are +between a # outside a character class and the next newline, inclusive, is ignored. An escaping backslash can be used to include a white space or a # character as part of the pattern. If the PCRE2_EXTENDED_MORE option is set, the same applies, but in addition unescaped space and horizontal tab characters are @@ -404,6 +420,14 @@ by \E later in the pattern, the literal interpretation continues to the end of the pattern (that is, \E is assumed at the end). If the isolated \Q is inside a character class, this causes an error, because the character class is then not terminated by a closing square bracket. +
++Another difference from Perl is that any appearance of \Q or \E inside what +might otherwise be a quantifier causes PCRE2 not to recognize the sequence as a +quantifier. Perl recognizes a quantifier if (redundantly) either of the numbers +is inside \Q...\E, but not if the separating comma is. When not recognized as +a quantifier a sequence such as {\Q1\E,2} is treated as the literal string +"{1,2}".
+By default, after \x that is not followed by {, one or two hexadecimal +digits are read (letters can be in upper or lower case). If the character that +follows \x is neither { nor a hexadecimal digit, an error occurs. This is +different from Perl's default behaviour, which generates a NUL character, but +is in line with the behaviour of Perl's 'strict' mode in re. +
++Any number of hexadecimal digits may appear between \x{ and }. If a character +other than a hexadecimal digit appears between \x{ and }, or if there is no +terminating }, an error occurs.
Characters whose code points are less than 256 can be defined by either of the @@ -481,69 +516,54 @@ the code unit following \c has a code point less than 32 or greater than 126, a compile-time error occurs.
-When PCRE2 is compiled in EBCDIC mode, \N{U+hhh..} is not supported. \a, \e, -\f, \n, \r, and \t generate the appropriate EBCDIC code values. The \c -escape is processed as specified for Perl in the perlebcdic document. The -only characters that are allowed after \c are A-Z, a-z, or one of @, [, \, ], -^, _, or ?. Any other character provokes a compile-time error. The sequence -\c@ encodes character code 0; after \c the letters (in either case) encode -characters 1-26 (hex 01 to hex 1A); [, \, ], ^, and _ encode characters 27-31 -(hex 1B to hex 1F), and \c? becomes either 255 (hex FF) or 95 (hex 5F). +For differences in the way some escapes behave in EBCDIC environments, +see section +"EBCDIC environments" +below.
+-Thus, apart from \c?, these escapes generate the same character code values as -they do in an ASCII environment, though the meanings of the values mostly -differ. For example, \cG always generates code value 7, which is BEL in ASCII -but DEL in EBCDIC. +The escape \o must be followed by a sequence of octal digits, enclosed in +braces. An error occurs if this is not the case. This escape provides a way of +specifying character code points as octal numbers greater than 0777, and it +also allows octal numbers and backreferences to be unambiguously distinguished.
-The sequence \c? generates DEL (127, hex 7F) in an ASCII environment, but -because 127 is not a control character in EBCDIC, Perl makes it generate the -APC character. Unfortunately, there are several variants of EBCDIC. In most of -them the APC character has the value 255 (hex FF), but in the one Perl calls -POSIX-BC its value is 95 (hex 5F). If certain other characters have POSIX-BC -values, PCRE2 makes \c? generate 95; otherwise it generates 255. +If braces are not used, after \0 up to two further octal digits are read. +However, if the PCRE2_EXTRA_NO_BS0 option is set, at least one more octal digit +must follow \0 (use \00 to generate a NUL character). Make sure you supply +two digits after the initial zero if the pattern character that follows is +itself an octal digit.
-After \0 up to two further octal digits are read. If there are fewer than two -digits, just those that are present are used. Thus the sequence \0\x\015 -specifies two binary zeros followed by a CR character (code value 13). Make -sure you supply two digits after the initial zero if the pattern character that -follows is itself an octal digit. +Inside a character class, when a backslash is followed by any octal digit, up +to three octal digits are read to generate a code point. Any subsequent digits +stand for themselves. The sequences \8 and \9 are treated as the literal +characters "8" and "9".
-The escape \o must be followed by a sequence of octal digits, enclosed in -braces. An error occurs if this is not the case. This escape is a recent -addition to Perl; it provides way of specifying character code points as octal -numbers greater than 0777, and it also allows octal numbers and backreferences -to be unambiguously specified. +Outside a character class, Perl's handling of a backslash followed by a digit +other than 0 is complicated by ambiguity, and Perl has changed over time, +causing PCRE2 also to change. From PCRE2 release 10.45 there is an option +called PCRE2_EXTRA_PYTHON_OCTAL that causes PCRE2 to use Python's unambiguous +rules. The next two subsections describe the two sets of rules.
For greater clarity and unambiguity, it is best to avoid following \ by a digit greater than zero. Instead, use \o{...} or \x{...} to specify numerical -character code points, and \g{...} to specify backreferences. The following -paragraphs describe the old, ambiguous syntax. -
--The handling of a backslash followed by a digit other than 0 is complicated, -and Perl has changed over time, causing PCRE2 also to change. -
--Outside a character class, PCRE2 reads the digit and any following digits as a -decimal number. If the number is less than 10, begins with the digit 8 or 9, or -if there are at least that many previous capture groups in the expression, the -entire sequence is taken as a backreference. A description of how this -works is given -later, -following the discussion of -parenthesized groups. -Otherwise, up to three octal digits are read to form a character code. +character code points, and \g{...} to specify backreferences.
+-Inside a character class, PCRE2 handles \8 and \9 as the literal characters -"8" and "9", and otherwise reads up to three octal digits following the -backslash, using them to generate a data character. Any subsequent digits stand -for themselves. For example, outside a character class: +All the digits that follow the backslash are read as a decimal number. If the +number is less than 10, begins with the digit 8 or 9, or if there are at least +that many previous capture groups in the expression, the entire sequence is +taken as a back reference. Otherwise, up to three octal digits are read to form +a character code. For example:
\040 is another way of writing an ASCII space \40 is the same, provided there are fewer than 40 previous capture groups @@ -560,6 +580,19 @@ must not be introduced by a leading zero, because no more than three octal digits are ever read.-The property names represented by xx above are not case-sensitive, and in -accordance with Unicode's "loose matching" rules, spaces, hyphens, and -underscores are ignored. There is support for Unicode script names, Unicode -general category properties, "Any", which matches any character (including -newline), Bidi_Class, a number of binary (yes/no) properties, and some special -PCRE2 properties (described +For compatibility with Perl, negation can be specified by including a +circumflex between the opening brace and the property. For example, \p{^Lu} is +the same as \P{Lu}. + +
+Python rules for non-class backslash 1-9 +
++If there are at least three octal digits after the backslash, exactly three are +read as an octal code point number, but the value must be no greater than +\377, even in modes where higher code point values are supported. Any +subsequent digits stand for themselves. If there are fewer than three octal +digits, the sequence is taken as a decimal back reference. Thus, for example, +\12 is always a back reference, independent of how many captures there are in +the pattern. An error is generated for a reference to a non-existent capturing +group. +
+
Constraints on character values
@@ -805,7 +838,7 @@ When PCRE2 is built with Unicode support (the default), three additional escape sequences that match characters with specific properties are available. They can be used in any mode, though in 8-bit and 16-bit non-UTF modes these sequences are of course limited to testing characters whose code points are -less than U+0100 and U+10000, respectively. In 32-bit non-UTF mode, code points +less than U+0100 or U+10000, respectively. In 32-bit non-UTF mode, code points greater than 0x10ffff (the Unicode limit) may be encountered. These are all treated as being in the Unknown script and with an unassigned type.
@@ -823,12 +856,33 @@ The extra escape sequences that provide property support are: \P{xx} a character without the xx property \X a Unicode extended grapheme cluster
+In accordance with Unicode's "loose matching" rules, ASCII white space +characters, hyphens, and underscores are ignored in the properties represented +by xx above. As well as the space character, ASCII white space can be +tab, linefeed, vertical tab, formfeed, or carriage return. +
++Some properties are specified as a name only; others as a name and a value, +separated by a colon or an equals sign. The names and values consist of ASCII +letters and digits (with one Perl-specific exception, see below). They are not +case sensitive. Note, however, that the escapes themselves, \p and \P, +are case sensitive. There are abbreviations for many names. The following +examples are all equivalent: +
+ \p{bidiclass=al}
+ \p{BC=al}
+ \p{ Bidi_Class : AL }
+ \p{ Bi-di class = Al }
+ \P{ ^ Bi-di class = Al }
+
+There is support for Unicode script names, Unicode general category properties,
+"Any", which matches any character (including newline), Bidi_Class, a number of
+binary (yes/no) properties, and some special PCRE2 properties (described
below).
Certain other Perl properties such as "InMusicalSymbols" are not supported by
PCRE2. Note that \P{Any} does not match any characters, so always causes a
@@ -844,10 +898,11 @@ Extensions") with which it is commonly used. Using the Adlam script as an
example, \p{sc:Adlam} matches characters whose basic script is Adlam, whereas
\p{scx:Adlam} matches, in addition, characters that have Adlam in their
extensions list. The full names "script" and "script extensions" for the
-property types are recognized, and a equals sign is an alternative to the
-colon. If a script name is given without a property type, for example,
-\p{Adlam}, it is treated as \p{scx:Adlam}. Perl changed to this
-interpretation at release 5.26 and PCRE2 changed at release 10.40.
+property types are recognized and, as for all property specifications, an
+equals sign is an alternative to the colon. If a script name is given without a
+property type, for example, \p{Adlam}, it is treated as \p{scx:Adlam}. Perl
+changed to this interpretation at release 5.26 and PCRE2 changed at release
+10.40.
Unassigned characters (and in non-UTF 32-bit mode, characters with code points
@@ -865,15 +920,10 @@ The general category property for \p and \P
Each character has exactly one Unicode general category property, specified by -a two-letter abbreviation. For compatibility with Perl, negation can be -specified by including a circumflex between the opening brace and the property -name. For example, \p{^Lu} is the same as \P{Lu}. -
--If only one letter is specified with \p or \P, it includes all the general -category properties that start with that letter. In this case, in the absence -of negation, the curly brackets in the escape sequence are optional; these two -examples have the same effect: +a two-letter abbreviation. If only one letter is specified with \p or \P, it +includes all the general category properties that start with that letter. In +this case, in the absence of negation, the curly brackets in the escape +sequence are optional; these two examples have the same effect:
\p{L}
\pL
@@ -888,6 +938,7 @@ The following general category property codes are supported:
Cs Surrogate
L Letter
+ Lc Cased letter
Ll Lower case letter
Lm Modifier letter
Lo Other letter
@@ -924,9 +975,13 @@ The following general category property codes are supported:
Zp Paragraph separator
Zs Space separator
-The special property LC, which has the synonym L&, is also supported: it
-matches a character that has the Lu, Ll, or Lt property, in other words, a
-letter that is not classified as a modifier or "other".
+Perl originally used the name L& for the Lc property. This is still supported
+by Perl, but discouraged. PCRE2 also still supports it. This property matches
+any character that has the Lu, Ll, or Lt property, in other words, any letter
+that is not classified as a modifier or "other". From release 10.45 of PCRE2
+the properties Lu, Ll, and Lt are all treated as Lc when case-independent
+matching is set by the PCRE2_CASELESS option or (?i) within the pattern. The
+other properties are not affected by caseless matching.
The Cs (Surrogate) property applies only to characters whose code points are in @@ -948,11 +1003,6 @@ No character that is in the Unicode table has the Cn (unassigned) property. Instead, this property is assumed for any code point that is not in the Unicode table.
--Specifying caseless matching does not affect these escape sequences. For -example, \p{Lu} always matches only upper case letters. This is different from -the behaviour of current versions of Perl. -
There is another non-standard property, Xuc, which matches any character that @@ -1389,13 +1440,12 @@ is actually required as a member of the class, ensure it is not the first character, or escape it with a backslash.
-For example, the character class [aeiou] matches any lower case vowel, while -[^aeiou] matches any character that is not a lower case vowel. Note that a -circumflex is just a convenient notation for specifying the characters that -are in the class by enumerating those that are not. A class that starts with a -circumflex is not an assertion; it still consumes a character from the subject -string, and therefore it fails if the current pointer is at the end of the -string. +For example, the character class [aeiou] matches any lower case English vowel, +whereas [^aeiou] matches all other characters. Note that a circumflex is just a +convenient notation for specifying the characters that are in the class by +enumerating those that are not. A class that starts with a circumflex is not an +assertion; it still consumes a character from the subject string, and therefore +it fails to match if the current pointer is at the end of the string.
Characters in a class may be specified by their code points using \o, \x, or @@ -1405,7 +1455,10 @@ a caseless [aeiou] matches "A" as well as "a", and a caseless [^aeiou] does not match "A", whereas a caseful version would. Note that there are two ASCII characters, K and S, that, in addition to their lower case ASCII equivalents, are case-equivalent with Unicode U+212A (Kelvin sign) and U+017F (long S) -respectively when either PCRE2_UTF or PCRE2_UCP is set. +respectively when either PCRE2_UTF or PCRE2_UCP is set. If you do not want +these ASCII/non-ASCII case equivalences, you can suppress them by setting +PCRE2_EXTRA_CASELESS_RESTRICT, either as an option in a compile context, or by +including (*CASELESS_RESTRICT) or (?r) within a pattern.
Characters that might indicate line breaks are never treated in any special way @@ -1437,6 +1490,12 @@ or immediately after a range. For example, [b-d-z] matches letters in the range b to d, a hyphen character, or z.
+There is some special treatment for alphabetic ranges in EBCDIC environments; +see the section +"EBCDIC environments" +below. +
+Perl treats a hyphen as a literal if it appears before or after a POSIX class (see below) or before or after a character type escape such as \d or \H. However, unless the hyphen is the last character in the class, Perl outputs a @@ -1448,9 +1507,9 @@ It is not possible to have the literal character "]" as the end character of a range. A pattern such as [W-]46] is interpreted as a class of two characters ("W" and "-") followed by a literal string "46]", so it would match "W46]" or "-46]". However, if the "]" is escaped with a backslash it is interpreted as -the end of range, so [W-\]46] is interpreted as a class containing a range -followed by two other characters. The octal or hexadecimal representation of -"]" can also be used to end a range. +the end of a range, so [W-\]46] is interpreted as a class containing a range +and two other characters. The octal or hexadecimal representation of "]" can +also be used to end a range.
Ranges normally include all code points between the start and end characters, @@ -1463,15 +1522,6 @@ this check). However, ranges such as [\x{d7ff}-\x{e000}], which include the surrogates, are always permitted.
-There is a special case in EBCDIC environments for ranges whose end points are -both specified as literal letters in the same case. For compatibility with -Perl, EBCDIC code points within the range that are not letters are omitted. For -example, [h-k] matches only four characters, even though the codes for h and k -are 0x88 and 0x92, a range of 11 code points. However, if the range is -specified numerically, for example, [\x88-\x92] or [h-\x92], all code points -are included. -
-If a range that includes letters is used when caseless matching is set, it matches the letters in either case. For example, [W-c] is equivalent to [][\\^_`wxyzabc], matched caselessly, and in a non-UTF mode, if character @@ -1487,18 +1537,132 @@ whereas [\w] includes underscore. A positive character class should be read as something AND NOT ...".
-The only metacharacters that are recognized in character classes are backslash, -hyphen (only where it can be interpreted as specifying a range), circumflex -(only at the start), opening square bracket (only when it can be interpreted as -introducing a POSIX class name, or for a special compatibility feature - see -the next two sections), and the terminating closing square bracket. However, -escaping other non-alphanumeric characters does no harm. +The metacharacters that are recognized in character classes are backslash, +hyphen (when it can be interpreted as specifying a range), circumflex +(only at the start), and the terminating closing square bracket. An opening +square bracket is also special when it can be interpreted as introducing a +POSIX class (see +"Posix character classes" +below), or a special compatibility feature (see +"Compatibility feature for word boundaries" +below). Escaping any non-alphanumeric character in a class turns it into a +literal, whether or not it would otherwise be a metacharacter. +
++From release 10.45 PCRE2 supports Perl's (?[...]) extended character class +syntax. This can be used to perform set operations such as intersection on +character classes. +
++The syntax permitted within (?[...]) is quite different to ordinary character +classes. Inside the extended class, there is an expression syntax consisting of +"atoms", operators, and ordinary parentheses "()" used for grouping. Such +classes always have the Perl /xx modifier (PCRE2 option PCRE2_EXTENDED_MORE) +turned on within them. This means that literal space and tab characters are +ignored everywhere in the class. +
++The allowed atoms are individual characters specified by escape sequences such +as \n or \x{123}, character types such as \d, POSIX classes such as +[:alpha:], and nested ordinary (non-extended) character classes. For example, +in (?[\d & [...]]) the nested class [...] follows the usual rules for ordinary +character classes, in which parentheses are not metacharacters, and character +literals and ranges are permitted. +
++Character literals and ranges may not appear outside a nested ordinary +character class because they are not atoms in the extended syntax. The extended +syntax does not introduce any additional escape sequences, so (?[\y]) is an +unknown escape, as it would be in [\y]. +
++In the extended syntax, ^ does not negate a class (except within an +ordinary class nested inside an extended class); it is instead a binary +operator. +
++The binary operators are "&" (intersection), "|" or "+" (union), "-" +(subtraction) and "^" (symmetric difference). These are left-associative and +"&" has higher (tighter) precedence, while the others have equal lower +precedence. The one prefix unary operator is "!" (complement), with highest +precedence. +
++The PCRE2_ALT_EXTENDED_CLASS option enables an alternative to Perl's (?[...]) +syntax, allowing instead extended class behaviour inside ordinary [...] +character classes. This altered syntax for [...] classes is loosely described +by the Unicode standard UTS#18. The PCRE2_ALT_EXTENDED_CLASS option does not +prevent use of (?[...]) classes; it just changes the meaning of all +[...] classes that are not nested inside a Perl (?[...]) class. +
++Firstly, in ordinary Perl [...] syntax, an expression such as "[a[]" is a +character class with two literal characters "a" and "[", but in UTS#18 extended +classes the "[" character becomes an additional metacharacter within classes, +denoting the start of a nested class, so a literal "[" must be escaped as "\[". +
++Secondly, within the UTS#18 extended syntax, there are operators "||", "&&", +"--" and "~~" which denote character class union, intersection, subtraction, +and symmetric difference respectively. In standard Perl syntax, these would +simply be needlessly-repeated literals (except for "--" which could be the +start or end of a range). In UTS#18 extended classes these operators can be used +in constructs such as [\p{L}--[QW]] for "Unicode letters, other than Q and W". +A literal "-" at the start or end of a range must be escaped, so while "[--1]" +in Perl syntax is the range from hyphen to "1", it must be escaped as "[\--1]" +in UTS#18 extended classes. +
++Unlike Perl's (?[...]) extended classes, the PCRE2_EXTENDED_MORE option to +ignore space and tab characters is not automatically enabled for UTS#18 +extended classes, but it is honoured if set. +
++Extended UTS#18 classes can be nested, and nested classes are themselves +extended classes (unlike Perl, where nested classes must be simple classes). +For example, [\p{L}&&[\p{Thai}||\p{Greek}]] matches any letter that is in +the Thai or Greek scripts. Note that this means that no special grouping +characters (such as the parentheses used in Perl's (?[...]) class syntax) are +needed. +
++Individual class items (literal characters, literal ranges, properties such as +\d or \p{...}, and nested classes) can be combined by juxtaposition or by an +operator. Juxtaposition is the implicit union operator, and binds more tightly +than any explicit operator. Thus a sequence of literals and/or ranges behaves +as if it is enclosed in square brackets. For example, [A-Z0-9&&[^E8]] is the +same as [[A-Z0-9]&&[^E8]], which matches any upper case alphanumeric character +except "E" or "8". +
++Precedence between the explicit operators is not defined, so mixing operators +is a syntax error. For example, [A&&B--C] is an error, but [A&&[B--C]] is +valid.
-+This is an emerging syntax which is being adopted gradually across the regex +ecosystem: for example JavaScript adopted the "/v" flag in ECMAScript 2024; +Python's "re" module reserves the syntax for future use with a FutureWarning +for unescaped use of "[" as a literal within character classes. Due to UTS#18 +providing insufficient guidance, engines interpret the syntax differently. +Rust's "regex" crate and Python's "regex" PyPI module both implement UTS#18 +extended classes, but with slight incompatibilities ([A||B&&C] is parsed as +[A||[B&&C]] in Python's "regex" but as [[A||B]&&C] in Rust's "regex"). +
+PCRE2's syntax adds syntax restrictions similar to ECMAScript's /v flag, so +that all the UTS#18 extended classes accepted as valid by PCRE2 have the +property that they are interpreted either with the same behaviour, or as +invalid, by all other major engines. Please file an issue if you are aware of +cross-engine differences in behaviour between PCRE2 and another major engine. +
+Perl supports the POSIX notation for character classes. This uses names enclosed by [: and :] within the enclosing square brackets. PCRE2 also supports -this notation. For example, +this notation, in both ordinary and extended classes. For example,
[01[:alpha:]%]@@ -1584,7 +1748,7 @@ property. [:xdigit:] In addition to the ASCII hexadecimal digits, this also matches the "fullwidth" versions of those characters, whose Unicode code points start at U+FF10. This -is a change that was made in PCRE release 10.43 for Perl compatibility. +is a change that was made in PCRE2 release 10.43 for Perl compatibility.
The other POSIX classes are unchanged by PCRE2_UCP, and match only characters @@ -1597,8 +1761,8 @@ just [:digit:] and [:xdigit:]. Within a pattern, this can be set and unset by (?aT) and (?-aT). The PCRE2_EXTRA_ASCII_POSIX option disables UCP processing for all POSIX classes, including [:digit:] and [:xdigit:]. Within a pattern, (?aP) and (?-aP) set and unset both these options for consistency. -
-In the POSIX.2 compliant library that was included in 4.4BSD Unix, the ugly syntax [[:<:]] and [[:>:]] is used for matching "start of word" and "end of @@ -1619,7 +1783,7 @@ used above in order to give exactly the POSIX behaviour. Note also that the PCRE2_UCP option changes the meaning of \w (and therefore \b) by default, so it also affects these POSIX sequences.
-Vertical bar characters are used to separate alternative patterns. For example, the pattern @@ -1634,7 +1798,7 @@ that succeeds is used. If the alternatives are within a group "succeeds" means matching the rest of the main pattern as well as the alternative in the group.
-The settings of several options can be changed within a pattern by a sequence of letters enclosed between "(?" and ")". The following are Perl-compatible, @@ -1732,7 +1896,7 @@ PCRE2_UTF and PCRE2_UCP options, respectively. However, the application can set the PCRE2_NEVER_UTF or PCRE2_NEVER_UCP options, which lock out the use of the (*UTF) and (*UCP) sequences.
-Groups are delimited by parentheses (round brackets), which can be nested. Turning part of a pattern into a group does two things: @@ -1788,7 +1952,7 @@ from left to right, and options are not reset until the end of the group is reached, an option setting in one branch does affect subsequent branches, so the above patterns match "SUNDAY" as well as "Saturday".
-Perl 5.10 introduced a feature whereby each alternative in a group uses the same numbers for its capturing parentheses. Such a group starts with (?| and is @@ -1834,7 +1998,7 @@ true if any group with that number has matched. An alternative approach to using this "branch reset" feature is to use duplicate named groups, as described in the next section.
-Identifying capture groups by number is simple, but it can be very hard to keep track of the numbers in complicated patterns. Furthermore, if an expression is @@ -1954,7 +2118,7 @@ capture groups, see the pcre2api documentation.
-Repetition is specified by quantifiers, which may follow any one of these items: @@ -2118,8 +2282,9 @@ one succeeds. Consider this pattern: (?>.*?a)b It matches "ab" in the subject "aab". The use of the backtracking control verbs -(*PRUNE) and (*SKIP) also disable this optimization, and there is an option, -PCRE2_NO_DOTSTAR_ANCHOR, to do so explicitly. +(*PRUNE) and (*SKIP) also disable this optimization. To do so explicitly, +either pass the compile option PCRE2_NO_DOTSTAR_ANCHOR, or call +pcre2_set_optimize() with a PCRE2_DOTSTAR_ANCHOR_OFF directive.
When a capture group is repeated, the value captured is the substring that @@ -2135,7 +2300,7 @@ captured values may have been set in previous iterations. For example, after matches "aba" the value of the second captured substring is "b".
-With both maximizing ("greedy") and minimizing ("ungreedy" or "lazy") repetition, failure of what follows normally causes the repeated item to be @@ -2216,8 +2381,9 @@ package, and PCRE1 copied it from there. It found its way into Perl at release PCRE2 has an optimization that automatically "possessifies" certain simple pattern constructs. For example, the sequence A+B is treated as A++B because there is no point in backtracking into a sequence of A's when B must follow. -This feature can be disabled by the PCRE2_NO_AUTOPOSSESS option, or starting -the pattern with (*NO_AUTO_POSSESS). +This feature can be disabled by the PCRE2_NO_AUTO_POSSESS option, by calling +pcre2_set_optimize() with a PCRE2_AUTO_POSSESS_OFF directive, or by +starting the pattern with (*NO_AUTO_POSSESS).
When a pattern contains an unlimited repeat inside a group that can itself be @@ -2245,7 +2411,7 @@ an atomic group, like this: sequences of non-digits cannot be broken, and failure happens quickly.
-Outside a character class, a backslash followed by a digit greater than 0 (and possibly further digits) is a backreference to a capture group earlier (that @@ -2383,23 +2549,32 @@ cause the group that they reference to be treated as an This restriction no longer applies, and backtracking into such groups can occur as normal.
--An assertion is a test on the characters following or preceding the current -matching point that does not consume any characters. The simple assertions -coded as \b, \B, \A, \G, \Z, \z, ^ and $ are described +An assertion is a test that does not consume any characters. The test must +succeed for the match to continue. The simple assertions coded as \b, \B, +\A, \G, \Z, \z, ^ and $ are described above.
-More complicated assertions are coded as parenthesized groups. There are two -kinds: those that look ahead of the current position in the subject string, and -those that look behind it, and in each case an assertion may be positive (must -match for the assertion to be true) or negative (must not match for the -assertion to be true). An assertion group is matched in the normal way, -and if it is true, matching continues after it, but with the matching position +More complicated assertions are coded as parenthesized groups. If matching such +a group succeeds, matching continues after it, but with the matching position in the subject string reset to what it was before the assertion was processed.
+A special kind of assertion, called a "scan substring" assertion, matches a +subpattern against a previously captured substring. This is described in the +section entitled +"Scan substring assertions" +below. It is a PCRE2 extension, not compatible with Perl. +
++The other group-based assertions are of two kinds: those that look ahead of the +current position in the subject string, and those that look behind it, and in +each case an assertion may be positive (must match for the assertion to be +true) or negative (must not match for the assertion to be true). +
+The Perl-compatible lookaround assertions are atomic. If an assertion is true, but there is a subsequent matching failure, there is no backtracking into the assertion. However, there are some cases where non-atomic assertions can be @@ -2624,7 +2799,7 @@ preceded by "foo", while is another pattern that matches "foo" preceded by three digits and any three characters that are not "999".
-Traditional lookaround assertions are atomic. That is, if an assertion is true, but there is a subsequent matching failure, there is no backtracking into the @@ -2683,8 +2858,67 @@ contain any control verbs such as (*ACCEPT). (This may change in future). Note that assertions that appear as conditions for conditional groups (see below) must be atomic. +
++A special kind of assertion, not compatible with Perl, makes it possible to +check the contents of a captured substring by matching it with a subpattern. +Because this involves capturing, this feature is not supported by +pcre2_dfa_match(). +
++A scan substring assertion starts with the sequence (*scan_substring: or +(*scs: which is followed by a list of substring numbers (absolute or relative) +and/or substring names enclosed in single quotes or angle brackets, all within +parentheses. The rest of the item is the subpattern that is applied to the +substring, as shown in these examples: +
+ (*scan_substring:(1)...)
+ (*scs:(-2)...)
+ (*scs:('AB')...)
+ (*scs:(1,'AB',-2)...)
+
+The list of groups is checked in the order they are given, and it is the
+contents of the first one that is found to be set that are scanned. When
+PCRE2_DUPNAMES is set and there are ambiguous group names, all groups with the
+same name are checked in numerical order. A scan substring assertion fails if
+none of the groups it references have been set.
-+The pattern match on the substring is always anchored, that is, it must match +from the start of the substring. There is no "bumpalong" if it does not match +at the start. The end of the subject is temporarily reset to be the end of the +substring, so \Z, \z, and $ will match there. However, the start of the +subject is not reset. This means that ^ matches only if the substring is +actually at the start of the main subject, but it also means that lookbehind +assertions into what precedes the substring are possible. +
++Here is a very simple example: find a word that contains the rare (in English) +sequence of letters "rh" not at the start: +
+ \b(\w++)(*scs:(1).+rh) ++The first group captures a word which is then scanned by the second group. +This example does not actually need this heavyweight feature; the same match +can be achieved with: +
+ \b\w+?rh\w*\b ++When things are more complicated, however, scanning a captured substring can be +a useful way to describe the required match. For example, there is a rather +complicated pattern in the PCRE2 test data that checks an entire subject string +for a palindrome, that is, the sequence of letters is the same in both +directions. Suppose you want to search for individual words of two or more +characters such as "level" that are palindromes: +
+ (\b\w{2,}+\b)(*scs:(1)...palindrome-matching-pattern...)
+
+Within a substring scanning subpattern, references to other groups work as
+normal. Capturing groups may appear, and will retain their values during
+ongoing matching if the assertion succeeds.
+
+In concept, a script run is a sequence of characters that are all from the same Unicode script such as Latin or Greek. However, because some scripts are @@ -2746,7 +2980,7 @@ parentheses. should not be used within a script run group, because it causes an immediate exit from the group, bypassing the script run checking.
-It is possible to cause the matching process to obey a pattern fragment conditionally or to choose between two alternative fragments, depending on @@ -2947,13 +3181,13 @@ positive and negative assertions, because matching always continues after the assertion, whether it succeeds or fails. (Compare non-conditional assertions, for which captures are retained only for positive assertions that succeed.)
-There are two ways of including comments in patterns that are processed by PCRE2. In both cases, the start of the comment must not be in a character class, nor in the middle of any other sequence of related characters such as -(?: or a group name or number. The characters that make up a comment play -no part in the pattern matching. +(?: or a group name or number or a Unicode property name. The characters that +make up a comment play no part in the pattern matching.
The sequence (?# marks the start of a comment that continues up to the next @@ -2977,7 +3211,7 @@ a newline in the pattern. The sequence \n is still literal at this stage, so it does not terminate the comment. Only an actual character with the code value 0x0a (the default newline) does so.
-Consider the problem of matching a string in parentheses, allowing for unlimited nested parentheses. Without the use of recursion, the best that can @@ -3165,7 +3399,7 @@ alternative matches "a" and then recurses. In the recursion, \1 does now match "b" and so the whole match succeeds. This match used to fail in Perl, but in later versions (I tried 5.024) it now works.
-If the syntax for a recursive group call (either by number or by name) is used outside the parentheses to which it refers, it operates a bit like a subroutine @@ -3213,7 +3447,7 @@ in groups when called as subroutines is described in the section entitled "Backtracking verbs in subroutines" below.
-For compatibility with Oniguruma, the non-Perl syntax \g followed by a name or a number enclosed either in angle brackets or single quotes, is an alternative @@ -3231,7 +3465,7 @@ plus or a minus sign it is taken as a relative reference. For example: Note that \g{...} (Perl syntax) and \g<...> (Oniguruma syntax) are not synonymous. The former is a backreference; the latter is a subroutine call.
-Perl has a feature whereby using the sequence (?{...}) causes arbitrary Perl code to be obeyed in the middle of matching a regular expression. This makes it @@ -3244,7 +3478,9 @@ code. The feature is called "callout". The caller of PCRE2 provides an external function by putting its entry point in a match context using the function pcre2_set_callout(), and then passing that context to pcre2_match() or pcre2_dfa_match(). If no match context is passed, or if the callout -entry point is set to NULL, callouts are disabled. +entry point is set to NULL, callout points will be passed over silently during +matching. To disallow callouts in the pattern syntax, you may use the +PCRE2_EXTRA_NEVER_CALLOUT option.
Within a regular expression, (?C<arg>) indicates a point at which the external @@ -3307,7 +3543,7 @@ example: The doubling is removed before the string is passed to the callout function.
-There are a number of special "Backtracking Control Verbs" (to use Perl's terminology) that modify the behaviour of backtracking during matching. They @@ -3347,8 +3583,8 @@ not there. Any number of these verbs may occur in a pattern. Except for
Since these verbs are specifically related to backtracking, most of them can be used only when the pattern is to be matched using the traditional matching -function, because that uses a backtracking algorithm. With the exception of -(*FAIL), which behaves like a failing negative assertion, the backtracking +function or JIT, because they use backtracking algorithms. With the exception +of (*FAIL), which behaves like a failing negative assertion, the backtracking control verbs cause an error if encountered by the DFA matching function.
@@ -3369,7 +3605,8 @@ minimum length of matching subject, or that a particular character must be present. When one of these optimizations bypasses the running of a match, any included backtracking verbs will not, of course, be processed. You can suppress the start-of-match optimizations by setting the PCRE2_NO_START_OPTIMIZE option -when calling pcre2_compile(), or by starting the pattern with +when calling pcre2_compile(), by calling pcre2_set_optimize() with a +PCRE2_START_OPTIMIZE_OFF directive, or by starting the pattern with (*NO_START_OPT). There is more discussion of this option in the section entitled "Compiling a pattern" @@ -3502,7 +3739,8 @@ attempts starting at "P" and then with an empty string do not get as far as the
If you are interested in (*MARK) values after failed matches, you should -probably set the PCRE2_NO_START_OPTIMIZE option +probably either set the PCRE2_NO_START_OPTIMIZE option or call +pcre2_set_optimize() with a PCRE2_START_OPTIMIZE_OFF directive (see above) to ensure that the match is always attempted.
@@ -3514,9 +3752,9 @@ The following verbs do nothing when they are encountered. Matching continues with what follows, but if there is a subsequent match failure, causing a backtrack to the verb, a failure is forced. That is, backtracking cannot pass to the left of the verb. However, when one of these verbs appears inside an -atomic group or in a lookaround assertion that is true, its effect is confined -to that group, because once the group has been matched, there is never any -backtracking into it. Backtracking from beyond an assertion or an atomic group +atomic group or in an atomic lookaround assertion that is true, its effect is +confined to that group, because once the group has been matched, there is never +any backtracking into it. Backtracking from beyond an atomic assertion or group ignores the entire group, and seeks a preceding backtracking point.@@ -3782,9 +4020,11 @@ into the assertion. Note in particular that a (*MARK) name that is set in an assertion is not "seen" by an instance of (*SKIP:NAME) later in the pattern.
-PCRE2 now supports non-atomic positive assertions, as described in the section -entitled +PCRE2 now supports non-atomic positive assertions and also "scan substring" +assertions, as described in the sections entitled "Non-atomic assertions" +and +"Scan substring assertions" above. These assertions must be standalone (not used as conditions). They are not Perl-compatible. For these assertions, a later backtrack does jump back into the assertion, and therefore verbs such as (*COMMIT) can be triggered by @@ -3793,7 +4033,8 @@ backtracks from later in the pattern.
The effect of (*THEN) is not allowed to escape beyond an assertion. If there are no more branches to try, (*THEN) causes a positive assertion to be false, -and a negative assertion to be true. +and a negative assertion to be true. This behaviour differs from Perl when the +assertion has only one branch.
The other backtracking verbs are not treated specially if they appear in a @@ -3829,13 +4070,57 @@ then a backtrack at the outer level. enclosing group that has alternatives (its normal behaviour). However, if there is no such group within the subroutine's group, the subroutine match fails and there is a backtrack at the outer level. +
++Differences in the way PCRE2 behaves when it is running in an EBCDIC environment +are covered in this section. +
++When PCRE2 is compiled in EBCDIC mode, \N{U+hhh..} is not supported. \a, \e, +\f, \n, \r, and \t generate the appropriate EBCDIC code values. The \c +escape is processed as specified for Perl in the perlebcdic document. The +only characters that are allowed after \c are A-Z, a-z, or one of @, [, \, ], +^, _, or ?. Any other character provokes a compile-time error. The sequence +\c@ encodes character code 0; after \c the letters (in either case) encode +characters 1-26 (hex 01 to hex 1A); [, \, ], ^, and _ encode characters 27-31 +(hex 1B to hex 1F), and \c? becomes either 255 (hex FF) or 95 (hex 5F). +
++Thus, apart from \c?, these escapes generate the same character code values as +they do in an ASCII or Unicode environment, though the meanings of the values +mostly differ. For example, \cG always generates code value 7, which is BEL in +ASCII but DEL in EBCDIC. +
++The sequence \c? generates DEL (127, hex 7F) in an ASCII environment, but +because 127 is not a control character in EBCDIC, Perl makes it generate the +APC character. Unfortunately, there are several variants of EBCDIC. In most of +them the APC character has the value 255 (hex FF), but in the one Perl calls +POSIX-BC its value is 95 (hex 5F). If certain other characters have POSIX-BC +values, PCRE2 makes \c? generate 95; otherwise it generates 255. +
++In character classes there is a special case in EBCDIC environments for ranges +whose end points are both specified as literal letters in the same case. For +compatibility with Perl, EBCDIC code points within the range that are not +letters are omitted. For example, [h-k] matches only four characters, even +though the EBCDIC codes for h and k are 0x88 and 0x92, a range of 11 code +points. However, if the range is specified numerically, for example, +[\x88-\x92] or [h-\x92], all code points are included.
-pcre2api(3), pcre2callout(3), pcre2matching(3), pcre2syntax(3), pcre2(3).
-
Philip Hazel
@@ -3844,9 +4129,9 @@ Retired from University Computing Service
Cambridge, England.
-Last updated: 04 June 2024
+Last updated: 27 November 2024
Copyright © 1997-2024 University of Cambridge.
diff --git a/doc/html/pcre2perform.html b/doc/html/pcre2perform.html
index 55fdf20..b595119 100644
--- a/doc/html/pcre2perform.html
+++ b/doc/html/pcre2perform.html
@@ -271,7 +271,7 @@ Cambridge, England.
-Last updated: 27 July 2022
+Last updated: 06 December 2022
Copyright © 1997-2022 University of Cambridge.
diff --git a/doc/html/pcre2posix.html b/doc/html/pcre2posix.html
index 6e7abd9..bc60c3b 100644
--- a/doc/html/pcre2posix.html
+++ b/doc/html/pcre2posix.html
@@ -171,7 +171,7 @@ REG_UTF. Note that REG_NOSPEC is not part of the POSIX standard.
When a pattern that is compiled with this flag is passed to
pcre2_regexec() for matching, the nmatch and pmatch arguments
-are ignored, and no captured strings are returned. Versions of the PCRE library
+are ignored, and no captured strings are returned. Versions of the PCRE2 library
prior to 10.22 used to set the PCRE2_NO_AUTO_CAPTURE compile option, but this
no longer happens because it disables the use of backreferences.
@@ -370,7 +370,7 @@ Cambridge, England.
REVISION
-Last updated: 19 January 2024 +Last updated: 27 November 2024
Copyright © 1997-2024 University of Cambridge.
diff --git a/doc/html/pcre2sample.html b/doc/html/pcre2sample.html index 345df03..0903f04 100644 --- a/doc/html/pcre2sample.html +++ b/doc/html/pcre2sample.html @@ -101,7 +101,7 @@ Cambridge, England. REVISION
-Last updated: 02 February 2016 +Last updated: 14 November 2023
Copyright © 1997-2016 University of Cambridge.
diff --git a/doc/html/pcre2serialize.html b/doc/html/pcre2serialize.html index 19418a8..d189bde 100644 --- a/doc/html/pcre2serialize.html +++ b/doc/html/pcre2serialize.html @@ -203,7 +203,7 @@ Cambridge, England.
REVISION
-Last updated: 27 June 2018 +Last updated: 19 January 2024
Copyright © 1997-2018 University of Cambridge.
diff --git a/doc/html/pcre2syntax.html b/doc/html/pcre2syntax.html index 1c0ccb0..46da3d7 100644 --- a/doc/html/pcre2syntax.html +++ b/doc/html/pcre2syntax.html @@ -24,34 +24,41 @@ please consult the man page, in case the conversion went wrong.
-The full syntax and semantics of the regular expressions that are supported by -PCRE2 are described in the +The full syntax and semantics of the regular expression patterns that are +supported by PCRE2 are described in the pcre2pattern -documentation. This document contains a quick-reference summary of the syntax. +documentation. This document contains a quick-reference summary of the pattern +syntax followed by the syntax of replacement strings in the substitution function. +The full description of the latter is in the +pcre2api +documentation.
@@ -60,7 +67,10 @@ documentation. This document contains a quick-reference summary of the syntax. \Q...\E treat enclosed characters as literal Note that white space inside \Q...\E is always treated as literal, even if -PCRE2_EXTENDED is set, causing most other white space to be ignored. +PCRE2_EXTENDED is set, causing most other white space to be ignored. Note also +that PCRE2's handling of \Q...\E has some differences from Perl's. See the +pcre2pattern +documentation for details.
@@ -91,6 +101,11 @@ sequence causes an error. \xhh character with hex code hh \x{hh..} character with hex code hh.. +\N{U+hh..} is synonymous with \x{hh..} but is not supported in environments +that use EBCDIC code (mainly IBM mainframes). Note that \N not followed by an +opening curly bracket has a different meaning (see below). +
+If PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set ("ALT_BSUX mode"), the following are also recognized:
@@ -98,7 +113,7 @@ following are also recognized:
\uhhhh character with hex code hhhh
\u{hh..} character with hex code hh.. but only for EXTRA_ALT_BSUX
-When \x is not followed by {, from zero to two hexadecimal digits are read,
+When \x is not followed by {, one or two hexadecimal digits are read,
but in ALT_BSUX mode \x must be followed by two hexadecimal digits to be
recognized as a hexadecimal escape; otherwise it matches a literal "x".
Likewise, if \u (in ALT_BSUX mode) is not followed by four hexadecimal digits
@@ -112,9 +127,7 @@ a non-zero digit is complicated; for details see the section
in the
pcre2pattern
documentation, where details of escape processing in EBCDIC environments are
-also given. \N{U+hh..} is synonymous with \x{hh..} in PCRE2 but is not
-supported in EBCDIC environments. Note that \N not followed by an opening
-curly bracket has a different meaning (see below).
+also given.
@@ -154,8 +167,9 @@ sequences to matching only ASCII characters.
Property descriptions in \p and \P are matched caselessly; hyphens, -underscores, and white space are ignored, in accordance with Unicode's "loose -matching" rules. +underscores, and ASCII white space characters are ignored, in accordance with +Unicode's "loose matching" rules. For example, \p{Bidi_Class=al} is the same +as \p{ bidi class = AL }.
@@ -168,13 +182,13 @@ matching" rules. Cs Surrogate L Letter + Lc Cased letter, the union of Ll, Lu, and Lt + L& Synonym of Lc Ll Lower case letter Lm Modifier letter Lo Other letter Lt Title case letter Lu Upper case letter - Lc Ll, Lu, or Lt - L& Ll, Lu, or Lt M Mark Mc Spacing mark @@ -205,7 +219,9 @@ matching" rules. Zl Line separator Zp Paragraph separator Zs Space separator - + +From release 10.45, when caseless matching is set, Ll, Lu, and Lt are all +equivalent to Lc.
@@ -268,7 +284,7 @@ The recognized classes are: RLI right-to-left isolate RLO right-to-left override S segment separator - WS which space + WS white space
+When PCRE2_ALT_EXTENDED_CLASS is set, UTS#18 extended character classes may be +used, allowing nested character classes, combined using set operators. +
+ [x&&[^y]] UTS#18 extended character class + + x||y set union (OR) + x&&y set intersection (AND) + x--y set difference (AND NOT) + x~~y set symmetric difference (XOR) + ++ +
+
+ (?[...]) Perl extended character class
+ (?[\p{Thai} & \p{Nd}]) operators; whitespace ignored
+ (?[(x - y) & z]) parentheses for grouping
+
+ (?[ [^3] & \p{Nd} ]) [...] is a nested ordinary class
+ (?[ [:alpha:] - [z] ]) POSIX set is allowed outside [...]
+ (?[ \d - [3] ]) backslash-escaped set is allowed outside [...]
+ (?[ !\n & [:ascii:] ]) backslash-escaped character is allowed outside [...]
+ all other characters or ranges must be enclosed in [...]
+
+ x|y, x+y set union (OR)
+ x&y set intersection (AND)
+ x-y set difference (AND NOT)
+ x^y set symmetric difference (XOR)
+ !x set complement (NOT)
+
+Inside a Perl extended character class, [...] switches mode to be interpreted
+as an ordinary character class. Outside of a nested [...], the only items
+permitted are backslash-escapes, POSIX sets, operators, and parentheses. Inside
+a nested ordinary class, ^ has its usual meaning (inverts the class when used
+as the first character); outside of a nested class, ^ is the XOR operator.
+
+
? 0 or 1, greedy
@@ -323,7 +377,7 @@ but some of them use Unicode properties if PCRE2_UCP is set. You can use
{,m}? zero up to m, lazy
-
\b word boundary @@ -341,7 +395,7 @@ but some of them use Unicode properties if PCRE2_UCP is set. You can use \G first matching position in subject-
\K set reported start of match @@ -351,13 +405,13 @@ for compatibility with Perl. However, if the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK option is set, the previous behaviour is re-enabled. When this option is set, \K is honoured in positive assertions, but ignored in negative ones. --
ALTERNATION
+
ALTERNATION
expr|expr|expr...-
CAPTURING
+
CAPTURING
(...) capture group @@ -372,20 +426,20 @@ In non-UTF modes, names may contain underscores and ASCII letters and digits; in UTF modes, any Unicode letters and Unicode decimal digits are permitted. In both cases, a name must not start with a digit. -(?aP) implies (?aT) as well, though this has no additional effect. However, it -means that (?-aP) is really (?-PT) which disables all ASCII restrictions for +means that (?-aP) also implies (?-aT) and disables all ASCII restrictions for POSIX classes.
ATOMIC GROUPS
+
ATOMIC GROUPS
(?>...) atomic non-capture group (*atomic:...) atomic non-capture group-
COMMENT
+
COMMENT
(?#....) comment (not nestable)-
OPTION SETTING
+
OPTION SETTING
Changes of these options within a group are automatically cancelled at the end of the group. @@ -409,7 +463,7 @@ of the group. (?^) unset imnrsx options
@@ -421,20 +475,22 @@ example (?i:...).
The following are recognized only at the very start of a pattern or after one -of the newline or \R options with similar syntax. More than one of them may -appear. For the first three, d is a decimal number. -
- (*LIMIT_DEPTH=d) set the backtracking limit to d - (*LIMIT_HEAP=d) set the heap size limit to d * 1024 bytes - (*LIMIT_MATCH=d) set the match limit to d - (*NOTEMPTY) set PCRE2_NOTEMPTY when matching - (*NOTEMPTY_ATSTART) set PCRE2_NOTEMPTY_ATSTART when matching - (*NO_AUTO_POSSESS) no auto-possessification (PCRE2_NO_AUTO_POSSESS) +of the newline or \R sequences or options with similar syntax. More than one +of them may appear. For the first three, d is a decimal number. +-+ (*LIMIT_DEPTH=d) set the backtracking limit to d + (*LIMIT_HEAP=d) set the heap size limit to d * 1024 bytes + (*LIMIT_MATCH=d) set the match limit to d + (*CASELESS_RESTRICT) set PCRE2_EXTRA_CASELESS_RESTRICT when matching + (*NOTEMPTY) set PCRE2_NOTEMPTY when matching + (*NOTEMPTY_ATSTART) set PCRE2_NOTEMPTY_ATSTART when matching + (*NO_AUTO_POSSESS) no auto-possessification (PCRE2_NO_AUTO_POSSESS) (*NO_DOTSTAR_ANCHOR) no .* anchoring (PCRE2_NO_DOTSTAR_ANCHOR) - (*NO_JIT) disable JIT optimization - (*NO_START_OPT) no start-match optimization (PCRE2_NO_START_OPTIMIZE) - (*UTF) set appropriate UTF mode for the library in use - (*UCP) set PCRE2_UCP (use Unicode properties for \d etc) + (*NO_JIT) disable JIT optimization + (*NO_START_OPT) no start-match optimization (PCRE2_NO_START_OPTIMIZE) + (*TURKISH_CASING) set PCRE2_EXTRA_TURKISH_CASING when matching + (*UTF) set appropriate UTF mode for the library in use + (*UCP) set PCRE2_UCP (use Unicode properties for \d etc)Note that LIMIT_DEPTH, LIMIT_HEAP, and LIMIT_MATCH can only reduce the value of the limits set by the caller of pcre2_match() or pcre2_dfa_match(), @@ -442,7 +498,7 @@ not increase them. LIMIT_RECURSION is an obsolete synonym for LIMIT_DEPTH. The application can lock out the use of (*UTF) and (*UCP) by setting the PCRE2_NEVER_UTF or PCRE2_NEVER_UCP options, respectively, at compile time. -
NEWLINE CONVENTION
+
NEWLINE CONVENTION
These are recognized only at the very start of the pattern or after option settings with a similar syntax. @@ -455,7 +511,7 @@ settings with a similar syntax. (*NUL) the NUL character (binary zero)
WHAT \R MATCHES
+
WHAT \R MATCHES
These are recognized only at the very start of the pattern or after option setting with a similar syntax. @@ -464,7 +520,7 @@ setting with a similar syntax. (*BSR_UNICODE) any Unicode newline sequence
(?=...) ) @@ -490,7 +546,7 @@ the maximum for each branch is limited to a value set by the caller of (ultimate default 255). If every branch matches a fixed number of characters, the limit for each branch is 65535 characters. -
NON-ATOMIC LOOKAROUND ASSERTIONS
+
NON-ATOMIC LOOKAROUND ASSERTIONS
These assertions are specific to PCRE2 and are not Perl-compatible.
@@ -503,7 +559,24 @@ These assertions are specific to PCRE2 and are not Perl-compatible. (*non_atomic_positive_lookbehind:...) )-
SCRIPT RUNS
+
SUBSTRING SCAN ASSERTION
++This feature is not Perl-compatible. +
+ (*scan_substring:(grouplist)...) scan captured substring + (*scs:(grouplist)...) scan captured substring ++The comma-separated list may identify groups in any of the following ways: ++ n absolute reference + +n relative reference + -n relative reference + <name> name + 'name' name + ++ +
SCRIPT RUNS
(*script_run:...) ) script run, can be backtracked into @@ -513,7 +586,7 @@ These assertions are specific to PCRE2 and are not Perl-compatible. (*asr:...) )-
BACKREFERENCES
+
BACKREFERENCES
\n reference by number (can be ambiguous) @@ -530,7 +603,7 @@ These assertions are specific to PCRE2 and are not Perl-compatible. (?P=name) reference by name (Python)-
SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)
+
SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)
(?R) recurse whole pattern @@ -549,7 +622,7 @@ These assertions are specific to PCRE2 and are not Perl-compatible. \g'-n' call subroutine by relative number (PCRE2 extension)-
CONDITIONAL PATTERNS
+
CONDITIONAL PATTERNS
(?(condition)yes-pattern) @@ -572,7 +645,7 @@ Note the ambiguity of (?(R) and (?(Rn) which might be named reference conditions or recursion tests. Such a condition is interpreted as a reference condition if the relevant named group exists. -
BACKTRACKING CONTROL
+
BACKTRACKING CONTROL
All backtracking control verbs may be in the form (*VERB:NAME). For (*MARK) the name is mandatory, for the others it is optional. (*SKIP) changes its behaviour @@ -599,7 +672,7 @@ pattern is not anchored. The effect of one of these verbs in a group called as a subroutine is confined to the subroutine call.
-
CALLOUTS
+
CALLOUTS
(?C) callout (assumed number 0) @@ -610,12 +683,58 @@ The allowed string delimiters are ` ' " ^ % # $ (which are the same for the start and the end), and the starting delimiter { matched with the ending delimiter }. To encode the ending delimiter within the string, double it. -
SEE ALSO
+
REPLACEMENT STRINGS
++If the PCRE2_SUBSTITUTE_LITERAL option is set, a replacement string for +pcre2_substitute() is not interpreted. Otherwise, by default, the only +special character is the dollar character in one of the following forms: +
+ $$ insert a dollar character + $n or ${n} insert the contents of group n + $<name> insert the contents of named group + $0 or $& insert the entire matched substring + $` insert the substring that precedes the match + $' insert the substring that follows the match + $_ insert the entire input string + $*MARK or ${*MARK} insert a control verb name ++For ${n}, n can be a name or a number. If PCRE2_SUBSTITUTE_EXTENDED is set, +there is additional interpretation: + ++1. Backslash is an escape character, and the forms described in "ESCAPED +CHARACTERS" above are recognized. Also: +
+ \Q...\E can be used to suppress interpretation + \l force the next character to lower case + \u force the next character to upper case + \L force subsequent characters to lower case + \U force subsequent characters to upper case + \u\L force next character to upper case, then all lower + \l\U force next character to lower case, then all upper + \E end \L or \U case forcing + \b backspace character (note: as in character class in pattern) + \v vertical tab character (note: not the same as in a pattern) ++2. The Python form \g<n>, where the angle brackets are part of the syntax and +n is either a group name or a number, is recognized as an alternative way +of inserting the contents of a group, for example \g<3>. + ++3. Capture substitution supports the following additional forms: +
+ ${n:-string} default for unset group + ${n:+string1:string2} values for set/unset group ++The substitution strings themselves are expanded. Backslash can be used to +escape colons and closing curly brackets. + +
SEE ALSO
pcre2pattern(3), pcre2api(3), pcre2callout(3), pcre2matching(3), pcre2(3).
-
AUTHOR
+
AUTHOR
Philip Hazel
-
@@ -624,11 +743,11 @@ Retired from University Computing Service Cambridge, England.
REVISION
+
REVISION
-Last updated: 12 October 2023 +Last updated: 27 November 2024
-Copyright © 1997-2023 University of Cambridge. +Copyright © 1997-2024 University of Cambridge.
Return to the PCRE2 index page. diff --git a/doc/html/pcre2test.html b/doc/html/pcre2test.html index 6cc3cc3..db9073f 100644 --- a/doc/html/pcre2test.html +++ b/doc/html/pcre2test.html @@ -105,8 +105,8 @@ Input for the 16-bit and 32-bit libraries
When testing the 16-bit or 32-bit libraries, there is a need to be able to generate character code points greater than 255 in the strings that are passed -to the library. For subject lines, backslash escapes can be used. In addition, -when the utf modifier (see +to the library. For subject lines and some patterns, backslash escapes can be +used. In addition, when the utf modifier (see "Setting compilation options" below) is set, the pattern and any following subject lines are interpreted as UTF-8 strings and translated to UTF-16 or UTF-32 as appropriate. @@ -125,9 +125,8 @@ UTF-8 (in its original definition) is not capable of encoding values greater than 0x7fffffff, but such values can be handled by the 32-bit library. When testing this library in non-UTF mode with utf8_input set, if any character is preceded by the byte 0xff (which is an invalid byte in UTF-8) -0x80000000 is added to the character's value. This is the only way of passing -such code points in a pattern string. For subject strings, using an escape -sequence is preferable. +0x80000000 is added to the character's value. For subject strings, using an +escape sequence is preferable.
COMMAND LINE OPTIONS
@@ -178,8 +177,8 @@ functionality is intended for use in scripts such as RunTest. The following options output the value and set the exit code as indicated:
ebcdic-nl the code for LF (= NL) in an EBCDIC environment: - 0x15 or 0x25 - 0 if used in an ASCII environment + either 0x15 or 0x25 + 0 if used in an ASCII/Unicode environment exit code is always 0 linksize the configured internal link size (2, 3, or 4) exit code is set to the link size @@ -201,6 +200,16 @@ to the same value: pcre2-8 the 8-bit library was built unicode Unicode support is available+Note that the availability of JIT support in the library does not guarantee +that it can actually be used because in some environments it is unable to +allocate executable memory. The option "jitusable" gives more detailed +information. It returns one of the following values: ++ 0 JIT is available and usable + 1 JIT is available but cannot allocate executable memory + 2 JIT is not available + 3 Unexpected return from test call to pcre2_jit_compile() +If an unknown option is given, an error message is output; the exit code is 0.@@ -527,39 +536,48 @@ space is removed, and the line is scanned for backslash escapes, unless the subject_literal modifier was set for the pattern. The following provide a means of encoding non-printing characters in a visible way:
- \a alarm (BEL, \x07) - \b backspace (\x08) - \e escape (\x27) - \f form feed (\x0c) - \n newline (\x0a) - \r carriage return (\x0d) - \t tab (\x09) - \v vertical tab (\x0b) - \nnn octal character (up to 3 octal digits); always - a byte unless > 255 in UTF-8 or 16-bit or 32-bit mode - \o{dd...} octal character (any number of octal digits} - \xhh hexadecimal byte (up to 2 hex digits) - \x{hh...} hexadecimal character (any number of hex digits) + \a alarm (BEL, \x07) + \b backspace (\x08) + \e escape (\x1b) + \f form feed (\x0c) + \n newline (\x0a) + \N{U+hh...} Unicode character (any number of hex digits) + \r carriage return (\x0d) + \t tab (\x09) + \v vertical tab (\x0b) + \ddd octal number (up to 3 octal digits); represents a single + code point unless larger than 255 with the 8-bit library + \o{dd...} octal number (any number of octal digits) representing a + character in UTF mode or a code point + \xhh hexadecimal byte (up to 2 hex digits) + \x{hh...} hexadecimal number (up to 8 hex digits) representing a + character in UTF mode or a code point-The use of \x{hh...} is not dependent on the use of the utf modifier on -the pattern. It is recognized always. There may be any number of hexadecimal -digits inside the braces; invalid values provoke error messages. +Invoking \N{U+hh...} or \x{hh...} doesn't require the use of the utf +modifier on the pattern. It is always recognized. There may be any number of +hexadecimal digits inside the braces; invalid values provoke error messages +but when using \N{U+hh...} with some invalid Unicode characters they will +be accepted with a warning instead.-Note that \xhh specifies one byte rather than one character in UTF-8 mode; -this makes it possible to construct invalid UTF-8 sequences for testing -purposes. On the other hand, \x{hh} is interpreted as a UTF-8 character in -UTF-8 mode, generating more than one byte if the value is greater than 127. 
-When testing the 8-bit library not in UTF-8 mode, \x{hh} generates one byte -for values less than 256, and causes an error for greater values. +Note that even in UTF-8 mode, \xhh (and, depending on how large it is, \ddd) +describe one byte rather than one character; this makes it possible to +construct invalid UTF-8 sequences for testing purposes. On the other hand, +\x{hh...} is interpreted as a UTF-8 character in UTF-8 mode, only generating +more than one byte if the value is greater than 127. To avoid the ambiguity +it is preferred to use \N{U+hh...} when describing characters. When testing +the 8-bit library not in UTF-8 mode, \x{hh} generates one byte for values +that fit in one byte, and causes an error for greater values.
-In UTF-16 mode, all 4-digit \x{hhhh} values are accepted. This makes it -possible to construct invalid UTF-16 sequences for testing purposes. +When testing the 16-bit library, not in UTF-16 mode, all 4-digit \x{hhhh} +values are accepted. This makes it possible to construct invalid UTF-16 +sequences for testing purposes.
-In UTF-32 mode, all 4- to 8-digit \x{...} values are accepted. This makes it -possible to construct invalid UTF-32 sequences for testing purposes. +When testing the 32-bit library, not in UTF-32 mode, all 4 to 8-digit \x{...} +values are accepted. This makes it possible to construct invalid UTF-32 +sequences for testing purposes.
There is a special backslash sequence that specifies replication of one or more @@ -625,6 +643,7 @@ for a description of the effects of these options. allow_surrogate_escapes set PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES alt_bsux set PCRE2_ALT_BSUX alt_circumflex set PCRE2_ALT_CIRCUMFLEX + alt_extended_class set PCRE2_ALT_EXTENDED_CLASS alt_verbnames set PCRE2_ALT_VERBNAMES anchored set PCRE2_ANCHORED /a ascii_all set all ASCII options @@ -653,13 +672,17 @@ for a description of the effects of these options. match_word set PCRE2_EXTRA_MATCH_WORD /m multiline set PCRE2_MULTILINE never_backslash_c set PCRE2_NEVER_BACKSLASH_C + never_callout set PCRE2_EXTRA_NEVER_CALLOUT never_ucp set PCRE2_NEVER_UCP never_utf set PCRE2_NEVER_UTF /n no_auto_capture set PCRE2_NO_AUTO_CAPTURE no_auto_possess set PCRE2_NO_AUTO_POSSESS + no_bs0 set PCRE2_EXTRA_NO_BS0 no_dotstar_anchor set PCRE2_NO_DOTSTAR_ANCHOR no_start_optimize set PCRE2_NO_START_OPTIMIZE no_utf_check set PCRE2_NO_UTF_CHECK + python_octal set PCRE2_EXTRA_PYTHON_OCTAL + turkish_casing set PCRE2_EXTRA_TURKISH_CASING ucp set PCRE2_UCP ungreedy set PCRE2_UNGREEDY use_offset_limit set PCRE2_USE_OFFSET_LIMIT @@ -671,6 +694,23 @@ notation. Otherwise, those less than 0x100 are output in hex without the curly brackets. Setting utf in 16-bit or 32-bit mode also causes pattern and subject strings to be translated to UTF-16 or UTF-32, respectively, before being passed to library functions. +
+
+The following modifiers enable or disable performance optimizations by +calling pcre2_set_optimize() before invoking the regex compiler. ++ optimization_full enable all optional optimizations + optimization_none disable all optional optimizations + auto_possess auto-possessify variable quantifiers + auto_possess_off don't auto-possessify variable quantifiers + dotstar_anchor anchor patterns starting with .* + dotstar_anchor_off don't anchor patterns starting with .* + start_optimize enable pre-scan of subject string + start_optimize_off disable pre-scan of subject string ++See the +pcre2_set_optimize +documentation for details on these optimizations.
Setting compilation controls @@ -680,14 +720,15 @@ The following modifiers affect the compilation process or request information about the pattern. There are single-letter abbreviations for some that are heavily used in the test files.- bsr=[anycrlf|unicode] specify \R handling /B bincode show binary code without lengths + bsr=[anycrlf|unicode] specify \R handling callout_info show callout information convert=<options> request foreign pattern conversion convert_glob_escape=c set glob escape character convert_glob_separator=c set glob separator character convert_length set convert buffer length debug same as info,fullbincode + expand expand repetition syntax in pattern framesize show matching frame size fullbincode show binary code with lengths /I info show info about compiled pattern @@ -709,6 +750,7 @@ heavily used in the test files. posix_nosub use the POSIX API with REG_NOSUB push push compiled pattern onto the stack pushcopy push a copy onto the stack + pushtablescopy push a copy with tables onto the stack stackguard=<number> test the stackguard feature subject_literal treat all subject lines as literal tables=[0|1|2|3] select internal tables @@ -1128,6 +1170,7 @@ process. replace=<string> specify a replacement string startchar show starting character when relevant substitute_callout use substitution callouts + substitute_case_callout use substitution case callouts substitute_extended use PCRE2_SUBSTITUTE_EXTENDED substitute_literal use PCRE2_SUBSTITUTE_LITERAL substitute_matched use PCRE2_SUBSTITUTE_MATCHED @@ -1217,10 +1260,11 @@ Setting match optionsThe following modifiers set options for pcre2_match() or pcre2_dfa_match(). See -pcreapi +pcre2api for a description of their effects.
anchored set PCRE2_ANCHORED + copy_matched_subject set PCRE2_COPY_MATCHED_SUBJECT endanchored set PCRE2_ENDANCHORED dfa_restart set PCRE2_DFA_RESTART dfa_shortest set PCRE2_DFA_SHORTEST @@ -1271,8 +1315,8 @@ pattern, but can be overridden by modifiers on the subject. aftertext show text after match allaftertext show text after captures allcaptures show all captures - allvector show the entire ovector allusedtext show all consulted text (non-JIT only) + allvector show the entire ovector altglobal alternative global matching callout_capture show captures at callout time callout_data=<n> set a value to pass via callouts @@ -1306,7 +1350,8 @@ pattern, but can be overridden by modifiers on the subject. startchar show startchar when relevant startoffset=<n> same as offset=<n> substitute_callout use substitution callouts - substitute_extedded use PCRE2_SUBSTITUTE_EXTENDED + substitute_case_callout use substitution case callouts + substitute_extended use PCRE2_SUBSTITUTE_EXTENDED substitute_literal use PCRE2_SUBSTITUTE_LITERAL substitute_matched use PCRE2_SUBSTITUTE_MATCHED substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH @@ -1592,6 +1637,21 @@ If both are set for the same number, stop takes precedence. Only a single skip or stop is supported, which is sufficient for testing that the feature works.
+Testing substitute case callouts +
++If the substitute_case_callout modifier is set, a substitution +case callout function is set up. The callout function is called for each +substituted chunk which is to be case-transformed. +
++The callout function passed is a fixed function with implementation for certain +behaviours: inputs which shrink when case-transformed; inputs which grow; inputs +with distinct upper/lower/titlecase forms. The characters which are not +special-cased for testing purposes are left unmodified, as if they are caseless +characters. +
+
Setting the JIT stack size
@@ -2204,7 +2264,7 @@ Cambridge, England.
REVISION
-Last updated: 24 April 2024 +Last updated: 26 December 2024
Copyright © 1997-2024 University of Cambridge.
diff --git a/doc/html/pcre2unicode.html b/doc/html/pcre2unicode.html index 6f0972e..5b42532 100644 --- a/doc/html/pcre2unicode.html +++ b/doc/html/pcre2unicode.html @@ -53,7 +53,7 @@ When PCRE2 is built with Unicode support, the escape sequences \p{..}, The Unicode properties that can be tested are a subset of those that Perl supports. Currently they are limited to the general category properties such as Lu for an upper case letter or Nd for a decimal number, the derived properties -Any and LC (synonym L&), the Unicode script names such as Arabic or Han, +Any and Lc (synonym L&), the Unicode script names such as Arabic or Han, Bidi_Class, Bidi_Control, and a few binary properties.@@ -157,6 +157,40 @@ Recognition of these non-ASCII characters as case-equivalent to their ASCII counterparts can be disabled by setting the PCRE2_EXTRA_CASELESS_RESTRICT option. When this is set, all characters in a case equivalence must either be ASCII or non-ASCII; there can be no mixing. +
+ Without PCRE2_EXTRA_CASELESS_RESTRICT: + 'k' = 'K' = U+212A (Kelvin sign) + 's' = 'S' = U+017F (long S) + With PCRE2_EXTRA_CASELESS_RESTRICT: + 'k' = 'K' + U+212A (Kelvin sign) only case-equivalent to itself + 's' = 'S' + U+017F (long S) only case-equivalent to itself ++ ++One language family, Turkish and Azeri, has its own case-insensitivity rules, +which can be selected by setting PCRE2_EXTRA_TURKISH_CASING. This alters the +behaviour of the 'i', 'I', U+0130 (capital I with dot above), and U+0131 +(small dotless i) characters. +
+ Without PCRE2_EXTRA_TURKISH_CASING: + 'i' = 'I' + U+0130 (capital I with dot above) only case-equivalent to itself + U+0131 (small dotless i) only case-equivalent to itself + With PCRE2_EXTRA_TURKISH_CASING: + 'i' = U+0130 (capital I with dot above) + U+0131 (small dotless i) = 'I' ++ ++It is not allowed to specify both PCRE2_EXTRA_CASELESS_RESTRICT and +PCRE2_EXTRA_TURKISH_CASING together. +
++From release 10.45 the Unicode letter properties Lu (upper case), Ll (lower +case), and Lt (title case) are all treated as Lc (cased letter) when caseless +matching is set by the PCRE2_CASELESS option or (?i) within the pattern.
SCRIPT RUNS @@ -513,9 +547,9 @@ Cambridge, England. REVISION
-Last updated: 12 October 2023 +Last updated: 27 November 2024
-Copyright © 1997-2023 University of Cambridge. +Copyright © 1997-2024 University of Cambridge.
Return to the PCRE2 index page. diff --git a/doc/index.html.src b/doc/index.html.src deleted file mode 100644 index e4dc786..0000000 --- a/doc/index.html.src +++ /dev/null @@ -1,318 +0,0 @@ - - -
-PCRE2 specification - - -Perl-compatible Regular Expressions (revised API: PCRE2)
--The HTML documentation for PCRE2 consists of a number of pages that are listed -below in alphabetical order. If you are new to PCRE2, please read the first one -first. -
- --
- -- - pcre2 -Introductory page - - pcre2-config -Information about the installation configuration - - pcre2api -PCRE2's native API - - pcre2build -Building PCRE2 - - pcre2callout -The callout facility - - pcre2compat -Compability with Perl - - pcre2convert -Experimental foreign pattern conversion functions - - pcre2demo -A demonstration C program that uses the PCRE2 library - - pcre2grep -The pcre2grep command - - pcre2jit -Discussion of the just-in-time optimization support - - pcre2limits -Details of size and other limits - - pcre2matching -Discussion of the two matching algorithms - - pcre2partial -Using PCRE2 for partial matching - - pcre2pattern -Specification of the regular expressions supported by PCRE2 - - pcre2perform -Some comments on performance - - pcre2posix -The POSIX API to the PCRE2 8-bit library - - pcre2sample -Discussion of the pcre2demo program - - pcre2serialize -Serializing functions for saving precompiled patterns - - pcre2syntax -Syntax quick-reference summary - - pcre2test -The pcre2test command for testing PCRE2 - pcre2unicode -Discussion of Unicode and UTF-8/UTF-16/UTF-32 support -There are also individual pages that summarize the interface for each function -in the library. -
- -
| pcre2_callout_enumerate | -Enumerate callouts in a compiled pattern |
| pcre2_code_copy | -Copy a compiled pattern |
| pcre2_code_copy_with_tables | -Copy a compiled pattern and its character tables |
| pcre2_code_free | -Free a compiled pattern |
| pcre2_compile | -Compile a regular expression pattern |
| pcre2_compile_context_copy | -Copy a compile context |
| pcre2_compile_context_create | -Create a compile context |
| pcre2_compile_context_free | -Free a compile context |
| pcre2_config | -Show build-time configuration options |
| pcre2_convert_context_copy | -Copy a convert context |
| pcre2_convert_context_create | -Create a convert context |
| pcre2_convert_context_free | -Free a convert context |
| pcre2_converted_pattern_free | -Free converted foreign pattern |
| pcre2_dfa_match | -Match a compiled pattern to a subject string - (DFA algorithm; not Perl compatible) |
| pcre2_general_context_copy | -Copy a general context |
| pcre2_general_context_create | -Create a general context |
| pcre2_general_context_free | -Free a general context |
| pcre2_get_error_message | -Get textual error message for error number |
| pcre2_get_mark | -Get a (*MARK) name |
| pcre2_get_match_data_size | -Get the size of a match data block |
| pcre2_get_ovector_count | -Get the ovector count |
| pcre2_get_ovector_pointer | -Get a pointer to the ovector |
| pcre2_get_startchar | -Get the starting character offset |
| pcre2_jit_compile | -Process a compiled pattern with the JIT compiler |
| pcre2_jit_free_unused_memory | -Free unused JIT memory |
| pcre2_jit_match | -Fast path interface to JIT matching |
| pcre2_jit_stack_assign | -Assign stack for JIT matching |
| pcre2_jit_stack_create | -Create a stack for JIT matching |
| pcre2_jit_stack_free | -Free a JIT matching stack |
| pcre2_maketables | -Build character tables in current locale |
| pcre2_maketables_free | -Free character tables |
| pcre2_match | -Match a compiled pattern to a subject string - (Perl compatible) |
| pcre2_match_context_copy | -Copy a match context |
| pcre2_match_context_create | -Create a match context |
| pcre2_match_context_free | -Free a match context |
| pcre2_match_data_create | -Create a match data block |
| pcre2_match_data_create_from_pattern | -Create a match data block getting size from pattern |
| pcre2_match_data_free | -Free a match data block |
| pcre2_pattern_convert | -Experimental foreign pattern converter |
| pcre2_pattern_info | -Extract information about a pattern |
| pcre2_serialize_decode | -Decode serialized compiled patterns |
| pcre2_serialize_encode | -Serialize compiled patterns for save/restore |
| pcre2_serialize_free | -Free serialized compiled patterns |
| pcre2_serialize_get_number_of_codes | -Get number of serialized compiled patterns |
| pcre2_set_bsr | -Set \R convention |
| pcre2_set_callout | -Set up a callout function |
| pcre2_set_character_tables | -Set character tables |
| pcre2_set_compile_extra_options | -Set compile time extra options |
| pcre2_set_compile_recursion_guard | -Set up a compile recursion guard function |
| pcre2_set_depth_limit | -Set the match backtracking depth limit |
| pcre2_set_glob_escape | -Set glob escape character |
| pcre2_set_glob_separator | -Set glob separator character |
| pcre2_set_heap_limit | -Set the match backtracking heap limit |
| pcre2_set_match_limit | -Set the match limit |
| pcre2_set_max_pattern_compiled_length | -Set the maximum length of a compiled pattern |
| pcre2_set_max_pattern_length | -Set the maximum length of a pattern |
| pcre2_set_max_varlookbehind | -Set the maximum match length for a variable-length lookbehind |
| pcre2_set_newline | -Set the newline convention |
| pcre2_set_offset_limit | -Set the offset limit |
| pcre2_set_parens_nest_limit | -Set the parentheses nesting limit |
| pcre2_set_recursion_limit | -Obsolete: use pcre2_set_depth_limit |
| pcre2_set_recursion_memory_management | -Obsolete function that (from 10.30 onwards) does nothing |
| pcre2_substitute | -Match a compiled pattern to a subject string and do - substitutions |
| pcre2_substring_copy_byname | -Extract named substring into given buffer |
| pcre2_substring_copy_bynumber | -Extract numbered substring into given buffer |
| pcre2_substring_free | -Free extracted substring |
| pcre2_substring_get_byname | -Extract named substring into new memory |
| pcre2_substring_get_bynumber | -Extract numbered substring into new memory |
| pcre2_substring_length_byname | -Find length of named substring |
| pcre2_substring_length_bynumber | -Find length of numbered substring |
| pcre2_substring_list_free | -Free list of extracted substrings |
| pcre2_substring_list_get | -Extract all substrings into new memory |
| pcre2_substring_nametable_scan | -Find table entries for given string name |
| pcre2_substring_number_from_name | -Convert captured string name to number |