Context Navigation

source: XMLIO_V2/external/src/POCO/Foundation.save/pcre_compile.c @ 80

Last change on this file since 80 was 80, checked in by ymipsl, 14 years ago
ajout lib externe
Property svn:eol-style set to `native`
File size: 198.1 KB

Rev	Line
[80]	1	/*************************************************
	2	* Perl-Compatible Regular Expressions *
	3	*************************************************/
	4
	5	/* PCRE is a library of functions to support regular expressions whose syntax
	6	and semantics are as close as possible to those of the Perl 5 language.
	7
	8	Written by Philip Hazel
	9	Copyright (c) 1997-2008 University of Cambridge
	10
	11	-----------------------------------------------------------------------------
	12	Redistribution and use in source and binary forms, with or without
	13	modification, are permitted provided that the following conditions are met:
	14
	15	* Redistributions of source code must retain the above copyright notice,
	16	this list of conditions and the following disclaimer.
	17
	18	* Redistributions in binary form must reproduce the above copyright
	19	notice, this list of conditions and the following disclaimer in the
	20	documentation and/or other materials provided with the distribution.
	21
	22	* Neither the name of the University of Cambridge nor the names of its
	23	contributors may be used to endorse or promote products derived from
	24	this software without specific prior written permission.
	25
	26	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
	27	AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	28	IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	29	ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
	30	LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
	31	CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
	32	SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
	33	INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
	34	CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
	35	ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	36	POSSIBILITY OF SUCH DAMAGE.
	37	-----------------------------------------------------------------------------
	38	*/
	39
	40
	41	/* This module contains the external function pcre_compile(), along with
	42	supporting internal functions that are not used by other modules. */
	43
	44
	45	#include "pcre_config.h"
	46
	47	#define NLBLOCK cd /* Block containing newline information */
	48	#define PSSTART start_pattern /* Field containing processed string start */
	49	#define PSEND end_pattern /* Field containing processed string end */
	50
	51	#include "pcre_internal.h"
	52
	53
	54	/* When DEBUG is defined, we need the pcre_printint() function, which is also
	55	used by pcretest. DEBUG is not defined when building a production library. */
	56
	57	#ifdef DEBUG
	58	#include "pcre_printint.src"
	59	#endif
	60
	61
	62	/* Macro for setting individual bits in class bitmaps. */
	63
	64	#define SETBIT(a,b) a[b/8] \|= (1 << (b%8))
	65
	66	/* Maximum length value to check against when making sure that the integer that
	67	holds the compiled pattern length does not overflow. We make it a bit less than
	68	INT_MAX to allow for adding in group terminating bytes, so that we don't have
	69	to check them every time. */
	70
	71	#define OFLOW_MAX (INT_MAX - 20)
	72
	73
	74	/*************************************************
	75	* Code parameters and static tables *
	76	*************************************************/
	77
	78	/* This value specifies the size of stack workspace that is used during the
	79	first pre-compile phase that determines how much memory is required. The regex
	80	is partly compiled into this space, but the compiled parts are discarded as
	81	soon as they can be, so that hopefully there will never be an overrun. The code
	82	does, however, check for an overrun. The largest amount I've seen used is 218,
	83	so this number is very generous.
	84
	85	The same workspace is used during the second, actual compile phase for
	86	remembering forward references to groups so that they can be filled in at the
	87	end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
	88	is 4 there is plenty of room. */
	89
	90	#define COMPILE_WORK_SIZE (4096)
	91
	92
	93	/* Table for handling escaped characters in the range '0'-'z'. Positive returns
	94	are simple data values; negative values are for special things like \d and so
	95	on. Zero means further processing is needed (for things like \x), or the escape
	96	is invalid. */
	97
	98	#ifndef EBCDIC /* This is the "normal" table for ASCII systems */
	99	static const short int escapes[] = {
	100	0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
	101	0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
	102	'@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
	103	-ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
	104	-ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
	105	-ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
	106	'`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
	107	-ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
	108	-ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
	109	0, 0, -ESC_z /* x - z */
	110	};
	111
	112	#else /* This is the "abnormal" table for EBCDIC systems */
	113	static const short int escapes[] = {
	114	/* 48 */ 0, 0, 0, '.', '<', '(', '+', '\|',
	115	/* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
	116	/* 58 / 0, 0, '!', '$', '', ')', ';', '~',
	117	/* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
	118	/* 68 */ 0, 0, '\|', ',', '%', '_', '>', '?',
	119	/* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
	120	/* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
	121	/* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
	122	/* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
	123	/* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
	124	/* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
	125	/* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
	126	/* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
	127	/* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
	128	/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
	129	/* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
	130	/* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
	131	/* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
	132	/* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
	133	/* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
	134	/* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
	135	/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
	136	/* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
	137	};
	138	#endif
	139
	140
	141	/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
	142	searched linearly. Put all the names into a single string, in order to reduce
	143	the number of relocations when a shared library is dynamically linked. */
	144
	145	typedef struct verbitem {
	146	int len;
	147	int op;
	148	} verbitem;
	149
	150	static const char verbnames[] =
	151	"ACCEPT\0"
	152	"COMMIT\0"
	153	"F\0"
	154	"FAIL\0"
	155	"PRUNE\0"
	156	"SKIP\0"
	157	"THEN";
	158
	159	static const verbitem verbs[] = {
	160	{ 6, OP_ACCEPT },
	161	{ 6, OP_COMMIT },
	162	{ 1, OP_FAIL },
	163	{ 4, OP_FAIL },
	164	{ 5, OP_PRUNE },
	165	{ 4, OP_SKIP },
	166	{ 4, OP_THEN }
	167	};
	168
	169	static const int verbcount = sizeof(verbs)/sizeof(verbitem);
	170
	171
	172	/* Tables of names of POSIX character classes and their lengths. The names are
	173	now all in a single string, to reduce the number of relocations when a shared
	174	library is dynamically loaded. The list of lengths is terminated by a zero
	175	length entry. The first three must be alpha, lower, upper, as this is assumed
	176	for handling case independence. */
	177
	178	static const char posix_names[] =
	179	"alpha\0" "lower\0" "upper\0" "alnum\0" "ascii\0" "blank\0"
	180	"cntrl\0" "digit\0" "graph\0" "print\0" "punct\0" "space\0"
	181	"word\0" "xdigit";
	182
	183	static const uschar posix_name_lengths[] = {
	184	5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
	185
	186	/* Table of class bit maps for each POSIX class. Each class is formed from a
	187	base map, with an optional addition or removal of another map. Then, for some
	188	classes, there is some additional tweaking: for [:blank:] the vertical space
	189	characters are removed, and for [:alpha:] and [:alnum:] the underscore
	190	character is removed. The triples in the table consist of the base map offset,
	191	second map offset or -1 if no second map, and a non-negative value for map
	192	addition or a negative value for map subtraction (if there are two maps). The
	193	absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
	194	remove vertical space characters, 2 => remove underscore. */
	195
	196	static const int posix_class_maps[] = {
	197	cbit_word, cbit_digit, -2, /* alpha */
	198	cbit_lower, -1, 0, /* lower */
	199	cbit_upper, -1, 0, /* upper */
	200	cbit_word, -1, 2, /* alnum - word without underscore */
	201	cbit_print, cbit_cntrl, 0, /* ascii */
	202	cbit_space, -1, 1, /* blank - a GNU extension */
	203	cbit_cntrl, -1, 0, /* cntrl */
	204	cbit_digit, -1, 0, /* digit */
	205	cbit_graph, -1, 0, /* graph */
	206	cbit_print, -1, 0, /* print */
	207	cbit_punct, -1, 0, /* punct */
	208	cbit_space, -1, 0, /* space */
	209	cbit_word, -1, 0, /* word - a Perl extension */
	210	cbit_xdigit,-1, 0 /* xdigit */
	211	};
	212
	213
	214	#define STRING(a) # a
	215	#define XSTRING(s) STRING(s)
	216
	217	/* The texts of compile-time error messages. These are "char *" because they
	218	are passed to the outside world. Do not ever re-use any error number, because
	219	they are documented. Always add a new error instead. Messages marked DEAD below
	220	are no longer used. This used to be a table of strings, but in order to reduce
	221	the number of relocations needed when a shared library is loaded dynamically,
	222	it is now one long string. We cannot use a table of offsets, because the
	223	lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
	224	simply count through to the one we want - this isn't a performance issue
	225	because these strings are used only when there is a compilation error. */
	226
	227	static const char error_texts[] =
	228	"no error\0"
	229	"\\ at end of pattern\0"
	230	"\\c at end of pattern\0"
	231	"unrecognized character follows \\\0"
	232	"numbers out of order in {} quantifier\0"
	233	/* 5 */
	234	"number too big in {} quantifier\0"
	235	"missing terminating ] for character class\0"
	236	"invalid escape sequence in character class\0"
	237	"range out of order in character class\0"
	238	"nothing to repeat\0"
	239	/* 10 */
	240	"operand of unlimited repeat could match the empty string\0" / DEAD /
	241	"internal error: unexpected repeat\0"
	242	"unrecognized character after (? or (?-\0"
	243	"POSIX named classes are supported only within a class\0"
	244	"missing )\0"
	245	/* 15 */
	246	"reference to non-existent subpattern\0"
	247	"erroffset passed as NULL\0"
	248	"unknown option bit(s) set\0"
	249	"missing ) after comment\0"
	250	"parentheses nested too deeply\0" / DEAD /
	251	/* 20 */
	252	"regular expression is too large\0"
	253	"failed to get memory\0"
	254	"unmatched parentheses\0"
	255	"internal error: code overflow\0"
	256	"unrecognized character after (?<\0"
	257	/* 25 */
	258	"lookbehind assertion is not fixed length\0"
	259	"malformed number or name after (?(\0"
	260	"conditional group contains more than two branches\0"
	261	"assertion expected after (?(\0"
	262	"(?R or (?[+-]digits must be followed by )\0"
	263	/* 30 */
	264	"unknown POSIX class name\0"
	265	"POSIX collating elements are not supported\0"
	266	"this version of PCRE is not compiled with PCRE_UTF8 support\0"
	267	"spare error\0" / DEAD /
	268	"character value in \\x{...} sequence is too large\0"
	269	/* 35 */
	270	"invalid condition (?(0)\0"
	271	"\\C not allowed in lookbehind assertion\0"
	272	"PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
	273	"number after (?C is > 255\0"
	274	"closing ) for (?C expected\0"
	275	/* 40 */
	276	"recursive call could loop indefinitely\0"
	277	"unrecognized character after (?P\0"
	278	"syntax error in subpattern name (missing terminator)\0"
	279	"two named subpatterns have the same name\0"
	280	"invalid UTF-8 string\0"
	281	/* 45 */
	282	"support for \\P, \\p, and \\X has not been compiled\0"
	283	"malformed \\P or \\p sequence\0"
	284	"unknown property name after \\P or \\p\0"
	285	"subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
	286	"too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
	287	/* 50 */
	288	"repeated subpattern is too long\0" / DEAD /
	289	"octal value is greater than \\377 (not in UTF-8 mode)\0"
	290	"internal error: overran compiling workspace\0"
	291	"internal error: previously-checked referenced subpattern not found\0"
	292	"DEFINE group contains more than one branch\0"
	293	/* 55 */
	294	"repeating a DEFINE group is not allowed\0"
	295	"inconsistent NEWLINE options\0"
	296	"\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
	297	"a numbered reference must not be zero\0"
	298	"(*VERB) with an argument is not supported\0"
	299	/* 60 */
	300	"(*VERB) not recognized\0"
	301	"number is too big\0"
	302	"subpattern name expected\0"
	303	"digit expected after (?+\0"
	304	"] is an invalid data character in JavaScript compatibility mode";
	305
	306
	307	/* Table to identify digits and hex digits. This is used when compiling
	308	patterns. Note that the tables in chartables are dependent on the locale, and
	309	may mark arbitrary characters as digits - but the PCRE compiling code expects
	310	to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
	311	a private table here. It costs 256 bytes, but it is a lot faster than doing
	312	character value tests (at least in some simple cases I timed), and in some
	313	applications one wants PCRE to compile efficiently as well as match
	314	efficiently.
	315
	316	For convenience, we use the same bit definitions as in chartables:
	317
	318	0x04 decimal digit
	319	0x08 hexadecimal digit
	320
	321	Then we can use ctype_digit and ctype_xdigit in the code. */
	322
	323	#ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
	324	static const unsigned char digitab[] =
	325	{
	326	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
	327	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
	328	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
	329	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
	330	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
	331	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
	332	0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
	333	0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
	334	0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
	335	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
	336	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
	337	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
	338	0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
	339	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
	340	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
	341	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
	342	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
	343	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
	344	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
	345	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
	346	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
	347	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
	348	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
	349	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
	350	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
	351	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
	352	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
	353	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
	354	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
	355	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
	356	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
	357	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
	358
	359	#else /* This is the "abnormal" case, for EBCDIC systems */
	360	static const unsigned char digitab[] =
	361	{
	362	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
	363	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
	364	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
	365	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
	366	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
	367	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
	368	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
	369	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
	370	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
	371	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- \| */
	372	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
	373	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
	374	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
	375	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
	376	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
	377	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
	378	0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
	379	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
	380	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
	381	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
	382	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
	383	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
	384	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
	385	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
	386	0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
	387	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
	388	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
	389	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
	390	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
	391	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
	392	0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
	393	0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
	394
	395	static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
	396	0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
	397	0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
	398	0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
	399	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
	400	0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
	401	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
	402	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
	403	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
	404	0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
	405	0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- \| */
	406	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
	407	0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
	408	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
	409	0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
	410	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
	411	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
	412	0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
	413	0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
	414	0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
	415	0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
	416	0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
	417	0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
	418	0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
	419	0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
	420	0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
	421	0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
	422	0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
	423	0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
	424	0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
	425	0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
	426	0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
	427	0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
	428	#endif
	429
	430
	431	/* Definition to allow mutual recursion */
	432
	433	static BOOL
	434	compile_regex(int, int, uschar , const uschar , int *, BOOL, BOOL, int,
	435	int , int , branch_chain , compile_data , int *);
	436
	437
	438
	439	/*************************************************
	440	* Find an error text *
	441	*************************************************/
	442
	443	/* The error texts are now all in one long string, to save on relocations. As
	444	some of the text is of unknown length, we can't use a table of offsets.
	445	Instead, just count through the strings. This is not a performance issue
	446	because it happens only when there has been a compilation error.
	447
	448	Argument: the error number
	449	Returns: pointer to the error string
	450	*/
	451
	452	static const char *
	453	find_error_text(int n)
	454	{
	455	const char *s = error_texts;
	456	for (; n > 0; n--) while (*s++ != 0) {};
	457	return s;
	458	}
	459
	460
	461	/*************************************************
	462	* Handle escapes *
	463	*************************************************/
	464
	465	/* This function is called when a \ has been encountered. It either returns a
	466	positive value for a simple escape such as \n, or a negative value which
	467	encodes one of the more complicated things such as \d. A backreference to group
	468	n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
	469	UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
	470	ptr is pointing at the \. On exit, it is on the final character of the escape
	471	sequence.
	472
	473	Arguments:
	474	ptrptr points to the pattern position pointer
	475	errorcodeptr points to the errorcode variable
	476	bracount number of previous extracting brackets
	477	options the options bits
	478	isclass TRUE if inside a character class
	479
	480	Returns: zero or positive => a data character
	481	negative => a special escape sequence
	482	on error, errorcodeptr is set
	483	*/
	484
	485	static int
	486	check_escape(const uschar *ptrptr, int errorcodeptr, int bracount,
	487	int options, BOOL isclass)
	488	{
	489	BOOL utf8 = (options & PCRE_UTF8) != 0;
	490	const uschar ptr = ptrptr + 1;
	491	int c, i;
	492
	493	GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
	494	ptr--; /* Set pointer back to the last byte */
	495
	496	/* If backslash is at the end of the pattern, it's an error. */
	497
	498	if (c == 0) *errorcodeptr = ERR1;
	499
	500	/* Non-alphanumerics are literals. For digits or letters, do an initial lookup
	501	in a table. A non-zero result is something that can be returned immediately.
	502	Otherwise further processing may be required. */
	503
	504	#ifndef EBCDIC /* ASCII coding */
	505	else if (c < '0' \|\| c > 'z') {} /* Not alphanumeric */
	506	else if ((i = escapes[c - '0']) != 0) c = i;
	507
	508	#else /* EBCDIC coding */
	509	else if (c < 'a' \|\| (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
	510	else if ((i = escapes[c - 0x48]) != 0) c = i;
	511	#endif
	512
	513	/* Escapes that need further processing, or are illegal. */
	514
	515	else
	516	{
	517	const uschar *oldptr;
	518	BOOL braced, negated;
	519
	520	switch (c)
	521	{
	522	/* A number of Perl escapes are not handled by PCRE. We give an explicit
	523	error. */
	524
	525	case 'l':
	526	case 'L':
	527	case 'N':
	528	case 'u':
	529	case 'U':
	530	*errorcodeptr = ERR37;
	531	break;
	532
	533	/* \g must be followed by one of a number of specific things:
	534
	535	(1) A number, either plain or braced. If positive, it is an absolute
	536	backreference. If negative, it is a relative backreference. This is a Perl
	537	5.10 feature.
	538
	539	(2) Perl 5.10 also supports \g{name} as a reference to a named group. This
	540	is part of Perl's movement towards a unified syntax for back references. As
	541	this is synonymous with \k{name}, we fudge it up by pretending it really
	542	was \k.
	543
	544	(3) For Oniguruma compatibility we also support \g followed by a name or a
	545	number either in angle brackets or in single quotes. However, these are
	546	(possibly recursive) subroutine calls, _not_ backreferences. Just return
	547	the -ESC_g code (cf \k). */
	548
	549	case 'g':
	550	if (ptr[1] == '<' \|\| ptr[1] == '\'')
	551	{
	552	c = -ESC_g;
	553	break;
	554	}
	555
	556	/* Handle the Perl-compatible cases */
	557
	558	if (ptr[1] == '{')
	559	{
	560	const uschar *p;
	561	for (p = ptr+2; p != 0 && p != '}'; p++)
	562	if (p != '-' && (digitab[p] & ctype_digit) == 0) break;
	563	if (p != 0 && p != '}')
	564	{
	565	c = -ESC_k;
	566	break;
	567	}
	568	braced = TRUE;
	569	ptr++;
	570	}
	571	else braced = FALSE;
	572
	573	if (ptr[1] == '-')
	574	{
	575	negated = TRUE;
	576	ptr++;
	577	}
	578	else negated = FALSE;
	579
	580	c = 0;
	581	while ((digitab[ptr[1]] & ctype_digit) != 0)
	582	c = c * 10 + *(++ptr) - '0';
	583
	584	if (c < 0) /* Integer overflow */
	585	{
	586	*errorcodeptr = ERR61;
	587	break;
	588	}
	589
	590	if (braced && *(++ptr) != '}')
	591	{
	592	*errorcodeptr = ERR57;
	593	break;
	594	}
	595
	596	if (c == 0)
	597	{
	598	*errorcodeptr = ERR58;
	599	break;
	600	}
	601
	602	if (negated)
	603	{
	604	if (c > bracount)
	605	{
	606	*errorcodeptr = ERR15;
	607	break;
	608	}
	609	c = bracount - (c - 1);
	610	}
	611
	612	c = -(ESC_REF + c);
	613	break;
	614
	615	/* The handling of escape sequences consisting of a string of digits
	616	starting with one that is not zero is not straightforward. By experiment,
	617	the way Perl works seems to be as follows:
	618
	619	Outside a character class, the digits are read as a decimal number. If the
	620	number is less than 10, or if there are that many previous extracting
	621	left brackets, then it is a back reference. Otherwise, up to three octal
	622	digits are read to form an escaped byte. Thus \123 is likely to be octal
	623	123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
	624	value is greater than 377, the least significant 8 bits are taken. Inside a
	625	character class, \ followed by a digit is always an octal number. */
	626
	627	case '1': case '2': case '3': case '4': case '5':
	628	case '6': case '7': case '8': case '9':
	629
	630	if (!isclass)
	631	{
	632	oldptr = ptr;
	633	c -= '0';
	634	while ((digitab[ptr[1]] & ctype_digit) != 0)
	635	c = c * 10 + *(++ptr) - '0';
	636	if (c < 0) /* Integer overflow */
	637	{
	638	*errorcodeptr = ERR61;
	639	break;
	640	}
	641	if (c < 10 \|\| c <= bracount)
	642	{
	643	c = -(ESC_REF + c);
	644	break;
	645	}
	646	ptr = oldptr; /* Put the pointer back and fall through */
	647	}
	648
	649	/* Handle an octal number following \. If the first digit is 8 or 9, Perl
	650	generates a binary zero byte and treats the digit as a following literal.
	651	Thus we have to pull back the pointer by one. */
	652
	653	if ((c = *ptr) >= '8')
	654	{
	655	ptr--;
	656	c = 0;
	657	break;
	658	}
	659
	660	/* \0 always starts an octal number, but we may drop through to here with a
	661	larger first octal digit. The original code used just to take the least
	662	significant 8 bits of octal numbers (I think this is what early Perls used
	663	to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
	664	than 3 octal digits. */
	665
	666	case '0':
	667	c -= '0';
	668	while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
	669	c = c * 8 + *(++ptr) - '0';
	670	if (!utf8 && c > 255) *errorcodeptr = ERR51;
	671	break;
	672
	673	/* \x is complicated. \x{ddd} is a character number which can be greater
	674	than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
	675	treated as a data character. */
	676
	677	case 'x':
	678	if (ptr[1] == '{')
	679	{
	680	const uschar *pt = ptr + 2;
	681	int count = 0;
	682
	683	c = 0;
	684	while ((digitab[*pt] & ctype_xdigit) != 0)
	685	{
	686	register int cc = *pt++;
	687	if (c == 0 && cc == '0') continue; /* Leading zeroes */
	688	count++;
	689
	690	#ifndef EBCDIC /* ASCII coding */
	691	if (cc >= 'a') cc -= 32; /* Convert to upper case */
	692	c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
	693	#else /* EBCDIC coding */
	694	if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
	695	c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
	696	#endif
	697	}
	698
	699	if (*pt == '}')
	700	{
	701	if (c < 0 \|\| count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
	702	ptr = pt;
	703	break;
	704	}
	705
	706	/* If the sequence of hex digits does not end with '}', then we don't
	707	recognize this construct; fall through to the normal \x handling. */
	708	}
	709
	710	/* Read just a single-byte hex-defined char */
	711
	712	c = 0;
	713	while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
	714	{
	715	int cc; /* Some compilers don't like ++ */
	716	cc = (++ptr); / in initializers */
	717	#ifndef EBCDIC /* ASCII coding */
	718	if (cc >= 'a') cc -= 32; /* Convert to upper case */
	719	c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
	720	#else /* EBCDIC coding */
	721	if (cc <= 'z') cc += 64; /* Convert to upper case */
	722	c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
	723	#endif
	724	}
	725	break;
	726
	727	/* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
	728	This coding is ASCII-specific, but then the whole concept of \cx is
	729	ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
	730
	731	case 'c':
	732	c = *(++ptr);
	733	if (c == 0)
	734	{
	735	*errorcodeptr = ERR2;
	736	break;
	737	}
	738
	739	#ifndef EBCDIC /* ASCII coding */
	740	if (c >= 'a' && c <= 'z') c -= 32;
	741	c ^= 0x40;
	742	#else /* EBCDIC coding */
	743	if (c >= 'a' && c <= 'z') c += 64;
	744	c ^= 0xC0;
	745	#endif
	746	break;
	747
	748	/* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
	749	other alphanumeric following \ is an error if PCRE_EXTRA was set;
	750	otherwise, for Perl compatibility, it is a literal. This code looks a bit
	751	odd, but there used to be some cases other than the default, and there may
	752	be again in future, so I haven't "optimized" it. */
	753
	754	default:
	755	if ((options & PCRE_EXTRA) != 0) switch(c)
	756	{
	757	default:
	758	*errorcodeptr = ERR3;
	759	break;
	760	}
	761	break;
	762	}
	763	}
	764
	765	*ptrptr = ptr;
	766	return c;
	767	}
	768
	769
	770
	771	#ifdef SUPPORT_UCP
	772	/*************************************************
	773	* Handle \P and \p *
	774	*************************************************/
	775
	776	/* This function is called after \P or \p has been encountered, provided that
	777	PCRE is compiled with support for Unicode properties. On entry, ptrptr is
	778	pointing at the P or p. On exit, it is pointing at the final character of the
	779	escape sequence.
	780
	781	Argument:
	782	ptrptr points to the pattern position pointer
	783	negptr points to a boolean that is set TRUE for negation else FALSE
	784	dptr points to an int that is set to the detailed property value
	785	errorcodeptr points to the error code variable
	786
	787	Returns: type value from ucp_type_table, or -1 for an invalid type
	788	*/
	789
	790	static int
	791	get_ucp(const uschar *ptrptr, BOOL negptr, int dptr, int errorcodeptr)
	792	{
	793	int c, i, bot, top;
	794	const uschar ptr = ptrptr;
	795	char name[32];
	796
	797	c = *(++ptr);
	798	if (c == 0) goto ERROR_RETURN;
	799
	800	*negptr = FALSE;
	801
	802	/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
	803	negation. */
	804
	805	if (c == '{')
	806	{
	807	if (ptr[1] == '^')
	808	{
	809	*negptr = TRUE;
	810	ptr++;
	811	}
	812	for (i = 0; i < (int)sizeof(name) - 1; i++)
	813	{
	814	c = *(++ptr);
	815	if (c == 0) goto ERROR_RETURN;
	816	if (c == '}') break;
	817	name[i] = c;
	818	}
	819	if (c !='}') goto ERROR_RETURN;
	820	name[i] = 0;
	821	}
	822
	823	/* Otherwise there is just one following character */
	824
	825	else
	826	{
	827	name[0] = c;
	828	name[1] = 0;
	829	}
	830
	831	*ptrptr = ptr;
	832
	833	/* Search for a recognized property name using binary chop */
	834
	835	bot = 0;
	836	top = _pcre_utt_size;
	837
	838	while (bot < top)
	839	{
	840	i = (bot + top) >> 1;
	841	c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
	842	if (c == 0)
	843	{
	844	*dptr = _pcre_utt[i].value;
	845	return _pcre_utt[i].type;
	846	}
	847	if (c > 0) bot = i + 1; else top = i;
	848	}
	849
	850	*errorcodeptr = ERR47;
	851	*ptrptr = ptr;
	852	return -1;
	853
	854	ERROR_RETURN:
	855	*errorcodeptr = ERR46;
	856	*ptrptr = ptr;
	857	return -1;
	858	}
	859	#endif
	860
	861
	862
	863
	864	/*************************************************
	865	* Check for counted repeat *
	866	*************************************************/
	867
	868	/* This function is called when a '{' is encountered in a place where it might
	869	start a quantifier. It looks ahead to see if it really is a quantifier or not.
	870	It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
	871	where the ddds are digits.
	872
	873	Arguments:
	874	p pointer to the first char after '{'
	875
	876	Returns: TRUE or FALSE
	877	*/
	878
	879	static BOOL
	880	is_counted_repeat(const uschar *p)
	881	{
	882	if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
	883	while ((digitab[*p] & ctype_digit) != 0) p++;
	884	if (*p == '}') return TRUE;
	885
	886	if (*p++ != ',') return FALSE;
	887	if (*p == '}') return TRUE;
	888
	889	if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
	890	while ((digitab[*p] & ctype_digit) != 0) p++;
	891
	892	return (*p == '}');
	893	}
	894
	895
	896
	897	/*************************************************
	898	* Read repeat counts *
	899	*************************************************/
	900
	901	/* Read an item of the form {n,m} and return the values. This is called only
	902	after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
	903	so the syntax is guaranteed to be correct, but we need to check the values.
	904
	905	Arguments:
	906	p pointer to first char after '{'
	907	minp pointer to int for min
	908	maxp pointer to int for max
	909	returned as -1 if no max
	910	errorcodeptr points to error code variable
	911
	912	Returns: pointer to '}' on success;
	913	current ptr on error, with errorcodeptr set non-zero
	914	*/
	915
	916	static const uschar *
	917	read_repeat_counts(const uschar p, int minp, int maxp, int errorcodeptr)
	918	{
	919	int min = 0;
	920	int max = -1;
	921
	922	/* Read the minimum value and do a paranoid check: a negative value indicates
	923	an integer overflow. */
	924
	925	while ((digitab[p] & ctype_digit) != 0) min = min 10 + *p++ - '0';
	926	if (min < 0 \|\| min > 65535)
	927	{
	928	*errorcodeptr = ERR5;
	929	return p;
	930	}
	931
	932	/* Read the maximum value if there is one, and again do a paranoid on its size.
	933	Also, max must not be less than min. */
	934
	935	if (*p == '}') max = min; else
	936	{
	937	if (*(++p) != '}')
	938	{
	939	max = 0;
	940	while((digitab[p] & ctype_digit) != 0) max = max 10 + *p++ - '0';
	941	if (max < 0 \|\| max > 65535)
	942	{
	943	*errorcodeptr = ERR5;
	944	return p;
	945	}
	946	if (max < min)
	947	{
	948	*errorcodeptr = ERR4;
	949	return p;
	950	}
	951	}
	952	}
	953
	954	/* Fill in the required variables, and pass back the pointer to the terminating
	955	'}'. */
	956
	957	*minp = min;
	958	*maxp = max;
	959	return p;
	960	}
	961
	962
	963
	964	/*************************************************
	965	* Find forward referenced subpattern *
	966	*************************************************/
	967
	968	/* This function scans along a pattern's text looking for capturing
	969	subpatterns, and counting them. If it finds a named pattern that matches the
	970	name it is given, it returns its number. Alternatively, if the name is NULL, it
	971	returns when it reaches a given numbered subpattern. This is used for forward
	972	references to subpatterns. We know that if (?P< is encountered, the name will
	973	be terminated by '>' because that is checked in the first pass.
	974
	975	Arguments:
	976	ptr current position in the pattern
	977	cd compile background data
	978	name name to seek, or NULL if seeking a numbered subpattern
	979	lorn name length, or subpattern number if name is NULL
	980	xmode TRUE if we are in /x mode
	981
	982	Returns: the number of the named subpattern, or -1 if not found
	983	*/
	984
	985	static int
	986	find_parens(const uschar ptr, compile_data cd, const uschar *name, int lorn,
	987	BOOL xmode)
	988	{
	989	const uschar *thisname;
	990	int count = cd->bracount;
	991
	992	for (; *ptr != 0; ptr++)
	993	{
	994	int term;
	995
	996	/* Skip over backslashed characters and also entire \Q...\E */
	997
	998	if (*ptr == '\\')
	999	{
	1000	if (*(++ptr) == 0) return -1;
	1001	if (*ptr == 'Q') for (;;)
	1002	{
	1003	while ((++ptr) != 0 && ptr != '\\') {};
	1004	if (*ptr == 0) return -1;
	1005	if (*(++ptr) == 'E') break;
	1006	}
	1007	continue;
	1008	}
	1009
	1010	/* Skip over character classes; this logic must be similar to the way they
	1011	are handled for real. If the first character is '^', skip it. Also, if the
	1012	first few characters (either before or after ^) are \Q\E or \E we skip them
	1013	too. This makes for compatibility with Perl. */
	1014
	1015	if (*ptr == '[')
	1016	{
	1017	BOOL negate_class = FALSE;
	1018	for (;;)
	1019	{
	1020	int c = *(++ptr);
	1021	if (c == '\\')
	1022	{
	1023	if (ptr[1] == 'E') ptr++;
	1024	else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
	1025	else break;
	1026	}
	1027	else if (!negate_class && c == '^')
	1028	negate_class = TRUE;
	1029	else break;
	1030	}
	1031
	1032	/* If the next character is ']', it is a data character that must be
	1033	skipped, except in JavaScript compatibility mode. */
	1034
	1035	if (ptr[1] == ']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
	1036	ptr++;
	1037
	1038	while (*(++ptr) != ']')
	1039	{
	1040	if (*ptr == 0) return -1;
	1041	if (*ptr == '\\')
	1042	{
	1043	if (*(++ptr) == 0) return -1;
	1044	if (*ptr == 'Q') for (;;)
	1045	{
	1046	while ((++ptr) != 0 && ptr != '\\') {};
	1047	if (*ptr == 0) return -1;
	1048	if (*(++ptr) == 'E') break;
	1049	}
	1050	continue;
	1051	}
	1052	}
	1053	continue;
	1054	}
	1055
	1056	/* Skip comments in /x mode */
	1057
	1058	if (xmode && *ptr == '#')
	1059	{
	1060	while ((++ptr) != 0 && ptr != '\n') {};
	1061	if (*ptr == 0) return -1;
	1062	continue;
	1063	}
	1064
	1065	/* An opening parens must now be a real metacharacter */
	1066
	1067	if (*ptr != '(') continue;
	1068	if (ptr[1] != '?' && ptr[1] != '*')
	1069	{
	1070	count++;
	1071	if (name == NULL && count == lorn) return count;
	1072	continue;
	1073	}
	1074
	1075	ptr += 2;
	1076	if (ptr == 'P') ptr++; / Allow optional P */
	1077
	1078	/* We have to disambiguate (?<! and (?<= from (?<name> */
	1079
	1080	if ((*ptr != '<' \|\| ptr[1] == '!' \|\| ptr[1] == '=') &&
	1081	*ptr != '\'')
	1082	continue;
	1083
	1084	count++;
	1085
	1086	if (name == NULL && count == lorn) return count;
	1087	term = *ptr++;
	1088	if (term == '<') term = '>';
	1089	thisname = ptr;
	1090	while (*ptr != term) ptr++;
	1091	if (name != NULL && lorn == ptr - thisname &&
	1092	strncmp((const char )name, (const char )thisname, lorn) == 0)
	1093	return count;
	1094	}
	1095
	1096	return -1;
	1097	}
	1098
	1099
	1100
	1101	/*************************************************
	1102	* Find first significant op code *
	1103	*************************************************/
	1104
	1105	/* This is called by several functions that scan a compiled expression looking
	1106	for a fixed first character, or an anchoring op code etc. It skips over things
	1107	that do not influence this. For some calls, a change of option is important.
	1108	For some calls, it makes sense to skip negative forward and all backward
	1109	assertions, and also the \b assertion; for others it does not.
	1110
	1111	Arguments:
	1112	code pointer to the start of the group
	1113	options pointer to external options
	1114	optbit the option bit whose changing is significant, or
	1115	zero if none are
	1116	skipassert TRUE if certain assertions are to be skipped
	1117
	1118	Returns: pointer to the first significant opcode
	1119	*/
	1120
	1121	static const uschar*
	1122	first_significant_code(const uschar code, int options, int optbit,
	1123	BOOL skipassert)
	1124	{
	1125	for (;;)
	1126	{
	1127	switch ((int)*code)
	1128	{
	1129	case OP_OPT:
	1130	if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
	1131	*options = (int)code[1];
	1132	code += 2;
	1133	break;
	1134
	1135	case OP_ASSERT_NOT:
	1136	case OP_ASSERTBACK:
	1137	case OP_ASSERTBACK_NOT:
	1138	if (!skipassert) return code;
	1139	do code += GET(code, 1); while (*code == OP_ALT);
	1140	code += _pcre_OP_lengths[*code];
	1141	break;
	1142
	1143	case OP_WORD_BOUNDARY:
	1144	case OP_NOT_WORD_BOUNDARY:
	1145	if (!skipassert) return code;
	1146	/* Fall through */
	1147
	1148	case OP_CALLOUT:
	1149	case OP_CREF:
	1150	case OP_RREF:
	1151	case OP_DEF:
	1152	code += _pcre_OP_lengths[*code];
	1153	break;
	1154
	1155	default:
	1156	return code;
	1157	}
	1158	}
	1159	/* Control never reaches here */
	1160	}
	1161
	1162
	1163
	1164
	1165	/*************************************************
	1166	* Find the fixed length of a pattern *
	1167	*************************************************/
	1168
	1169	/* Scan a pattern and compute the fixed length of subject that will match it,
	1170	if the length is fixed. This is needed for dealing with backward assertions.
	1171	In UTF8 mode, the result is in characters rather than bytes.
	1172
	1173	Arguments:
	1174	code points to the start of the pattern (the bracket)
	1175	options the compiling options
	1176
	1177	Returns: the fixed length, or -1 if there is no fixed length,
	1178	or -2 if \C was encountered
	1179	*/
	1180
	1181	static int
	1182	find_fixedlength(uschar *code, int options)
	1183	{
	1184	int length = -1;
	1185
	1186	register int branchlength = 0;
	1187	register uschar *cc = code + 1 + LINK_SIZE;
	1188
	1189	/* Scan along the opcodes for this branch. If we get to the end of the
	1190	branch, check the length against that of the other branches. */
	1191
	1192	for (;;)
	1193	{
	1194	int d;
	1195	register int op = *cc;
	1196	switch (op)
	1197	{
	1198	case OP_CBRA:
	1199	case OP_BRA:
	1200	case OP_ONCE:
	1201	case OP_COND:
	1202	d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
	1203	if (d < 0) return d;
	1204	branchlength += d;
	1205	do cc += GET(cc, 1); while (*cc == OP_ALT);
	1206	cc += 1 + LINK_SIZE;
	1207	break;
	1208
	1209	/* Reached end of a branch; if it's a ket it is the end of a nested
	1210	call. If it's ALT it is an alternation in a nested call. If it is
	1211	END it's the end of the outer call. All can be handled by the same code. */
	1212
	1213	case OP_ALT:
	1214	case OP_KET:
	1215	case OP_KETRMAX:
	1216	case OP_KETRMIN:
	1217	case OP_END:
	1218	if (length < 0) length = branchlength;
	1219	else if (length != branchlength) return -1;
	1220	if (*cc != OP_ALT) return length;
	1221	cc += 1 + LINK_SIZE;
	1222	branchlength = 0;
	1223	break;
	1224
	1225	/* Skip over assertive subpatterns */
	1226
	1227	case OP_ASSERT:
	1228	case OP_ASSERT_NOT:
	1229	case OP_ASSERTBACK:
	1230	case OP_ASSERTBACK_NOT:
	1231	do cc += GET(cc, 1); while (*cc == OP_ALT);
	1232	/* Fall through */
	1233
	1234	/* Skip over things that don't match chars */
	1235
	1236	case OP_REVERSE:
	1237	case OP_CREF:
	1238	case OP_RREF:
	1239	case OP_DEF:
	1240	case OP_OPT:
	1241	case OP_CALLOUT:
	1242	case OP_SOD:
	1243	case OP_SOM:
	1244	case OP_EOD:
	1245	case OP_EODN:
	1246	case OP_CIRC:
	1247	case OP_DOLL:
	1248	case OP_NOT_WORD_BOUNDARY:
	1249	case OP_WORD_BOUNDARY:
	1250	cc += _pcre_OP_lengths[*cc];
	1251	break;
	1252
	1253	/* Handle literal characters */
	1254
	1255	case OP_CHAR:
	1256	case OP_CHARNC:
	1257	case OP_NOT:
	1258	branchlength++;
	1259	cc += 2;
	1260	#ifdef SUPPORT_UTF8
	1261	if ((options & PCRE_UTF8) != 0)
	1262	{
	1263	while ((*cc & 0xc0) == 0x80) cc++;
	1264	}
	1265	#endif
	1266	break;
	1267
	1268	/* Handle exact repetitions. The count is already in characters, but we
	1269	need to skip over a multibyte character in UTF8 mode. */
	1270
	1271	case OP_EXACT:
	1272	branchlength += GET2(cc,1);
	1273	cc += 4;
	1274	#ifdef SUPPORT_UTF8
	1275	if ((options & PCRE_UTF8) != 0)
	1276	{
	1277	while((*cc & 0x80) == 0x80) cc++;
	1278	}
	1279	#endif
	1280	break;
	1281
	1282	case OP_TYPEEXACT:
	1283	branchlength += GET2(cc,1);
	1284	if (cc[3] == OP_PROP \|\| cc[3] == OP_NOTPROP) cc += 2;
	1285	cc += 4;
	1286	break;
	1287
	1288	/* Handle single-char matchers */
	1289
	1290	case OP_PROP:
	1291	case OP_NOTPROP:
	1292	cc += 2;
	1293	/* Fall through */
	1294
	1295	case OP_NOT_DIGIT:
	1296	case OP_DIGIT:
	1297	case OP_NOT_WHITESPACE:
	1298	case OP_WHITESPACE:
	1299	case OP_NOT_WORDCHAR:
	1300	case OP_WORDCHAR:
	1301	case OP_ANY:
	1302	case OP_ALLANY:
	1303	branchlength++;
	1304	cc++;
	1305	break;
	1306
	1307	/* The single-byte matcher isn't allowed */
	1308
	1309	case OP_ANYBYTE:
	1310	return -2;
	1311
	1312	/* Check a class for variable quantification */
	1313
	1314	#ifdef SUPPORT_UTF8
	1315	case OP_XCLASS:
	1316	cc += GET(cc, 1) - 33;
	1317	/* Fall through */
	1318	#endif
	1319
	1320	case OP_CLASS:
	1321	case OP_NCLASS:
	1322	cc += 33;
	1323
	1324	switch (*cc)
	1325	{
	1326	case OP_CRSTAR:
	1327	case OP_CRMINSTAR:
	1328	case OP_CRQUERY:
	1329	case OP_CRMINQUERY:
	1330	return -1;
	1331
	1332	case OP_CRRANGE:
	1333	case OP_CRMINRANGE:
	1334	if (GET2(cc,1) != GET2(cc,3)) return -1;
	1335	branchlength += GET2(cc,1);
	1336	cc += 5;
	1337	break;
	1338
	1339	default:
	1340	branchlength++;
	1341	}
	1342	break;
	1343
	1344	/* Anything else is variable length */
	1345
	1346	default:
	1347	return -1;
	1348	}
	1349	}
	1350	/* Control never gets here */
	1351	}
	1352
	1353
	1354
	1355
	1356	/*************************************************
	1357	* Scan compiled regex for numbered bracket *
	1358	*************************************************/
	1359
	1360	/* This little function scans through a compiled pattern until it finds a
	1361	capturing bracket with the given number.
	1362
	1363	Arguments:
	1364	code points to start of expression
	1365	utf8 TRUE in UTF-8 mode
	1366	number the required bracket number
	1367
	1368	Returns: pointer to the opcode for the bracket, or NULL if not found
	1369	*/
	1370
	1371	static const uschar *
	1372	find_bracket(const uschar *code, BOOL utf8, int number)
	1373	{
	1374	for (;;)
	1375	{
	1376	register int c = *code;
	1377	if (c == OP_END) return NULL;
	1378
	1379	/* XCLASS is used for classes that cannot be represented just by a bit
	1380	map. This includes negated single high-valued characters. The length in
	1381	the table is zero; the actual length is stored in the compiled code. */
	1382
	1383	if (c == OP_XCLASS) code += GET(code, 1);
	1384
	1385	/* Handle capturing bracket */
	1386
	1387	else if (c == OP_CBRA)
	1388	{
	1389	int n = GET2(code, 1+LINK_SIZE);
	1390	if (n == number) return (uschar *)code;
	1391	code += _pcre_OP_lengths[c];
	1392	}
	1393
	1394	/* Otherwise, we can get the item's length from the table, except that for
	1395	repeated character types, we have to test for \p and \P, which have an extra
	1396	two bytes of parameters. */
	1397
	1398	else
	1399	{
	1400	switch(c)
	1401	{
	1402	case OP_TYPESTAR:
	1403	case OP_TYPEMINSTAR:
	1404	case OP_TYPEPLUS:
	1405	case OP_TYPEMINPLUS:
	1406	case OP_TYPEQUERY:
	1407	case OP_TYPEMINQUERY:
	1408	case OP_TYPEPOSSTAR:
	1409	case OP_TYPEPOSPLUS:
	1410	case OP_TYPEPOSQUERY:
	1411	if (code[1] == OP_PROP \|\| code[1] == OP_NOTPROP) code += 2;
	1412	break;
	1413
	1414	case OP_TYPEUPTO:
	1415	case OP_TYPEMINUPTO:
	1416	case OP_TYPEEXACT:
	1417	case OP_TYPEPOSUPTO:
	1418	if (code[3] == OP_PROP \|\| code[3] == OP_NOTPROP) code += 2;
	1419	break;
	1420	}
	1421
	1422	/* Add in the fixed length from the table */
	1423
	1424	code += _pcre_OP_lengths[c];
	1425
	1426	/* In UTF-8 mode, opcodes that are followed by a character may be followed by
	1427	a multi-byte character. The length in the table is a minimum, so we have to
	1428	arrange to skip the extra bytes. */
	1429
	1430	#ifdef SUPPORT_UTF8
	1431	if (utf8) switch(c)
	1432	{
	1433	case OP_CHAR:
	1434	case OP_CHARNC:
	1435	case OP_EXACT:
	1436	case OP_UPTO:
	1437	case OP_MINUPTO:
	1438	case OP_POSUPTO:
	1439	case OP_STAR:
	1440	case OP_MINSTAR:
	1441	case OP_POSSTAR:
	1442	case OP_PLUS:
	1443	case OP_MINPLUS:
	1444	case OP_POSPLUS:
	1445	case OP_QUERY:
	1446	case OP_MINQUERY:
	1447	case OP_POSQUERY:
	1448	if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
	1449	break;
	1450	}
	1451	#else
	1452	(void)(utf8); /* Keep compiler happy by referencing function argument */
	1453	#endif
	1454	}
	1455	}
	1456	}
	1457
	1458
	1459
	1460	/*************************************************
	1461	* Scan compiled regex for recursion reference *
	1462	*************************************************/
	1463
	1464	/* This little function scans through a compiled pattern until it finds an
	1465	instance of OP_RECURSE.
	1466
	1467	Arguments:
	1468	code points to start of expression
	1469	utf8 TRUE in UTF-8 mode
	1470
	1471	Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
	1472	*/
	1473
	1474	static const uschar *
	1475	find_recurse(const uschar *code, BOOL utf8)
	1476	{
	1477	for (;;)
	1478	{
	1479	register int c = *code;
	1480	if (c == OP_END) return NULL;
	1481	if (c == OP_RECURSE) return code;
	1482
	1483	/* XCLASS is used for classes that cannot be represented just by a bit
	1484	map. This includes negated single high-valued characters. The length in
	1485	the table is zero; the actual length is stored in the compiled code. */
	1486
	1487	if (c == OP_XCLASS) code += GET(code, 1);
	1488
	1489	/* Otherwise, we can get the item's length from the table, except that for
	1490	repeated character types, we have to test for \p and \P, which have an extra
	1491	two bytes of parameters. */
	1492
	1493	else
	1494	{
	1495	switch(c)
	1496	{
	1497	case OP_TYPESTAR:
	1498	case OP_TYPEMINSTAR:
	1499	case OP_TYPEPLUS:
	1500	case OP_TYPEMINPLUS:
	1501	case OP_TYPEQUERY:
	1502	case OP_TYPEMINQUERY:
	1503	case OP_TYPEPOSSTAR:
	1504	case OP_TYPEPOSPLUS:
	1505	case OP_TYPEPOSQUERY:
	1506	if (code[1] == OP_PROP \|\| code[1] == OP_NOTPROP) code += 2;
	1507	break;
	1508
	1509	case OP_TYPEPOSUPTO:
	1510	case OP_TYPEUPTO:
	1511	case OP_TYPEMINUPTO:
	1512	case OP_TYPEEXACT:
	1513	if (code[3] == OP_PROP \|\| code[3] == OP_NOTPROP) code += 2;
	1514	break;
	1515	}
	1516
	1517	/* Add in the fixed length from the table */
	1518
	1519	code += _pcre_OP_lengths[c];
	1520
	1521	/* In UTF-8 mode, opcodes that are followed by a character may be followed
	1522	by a multi-byte character. The length in the table is a minimum, so we have
	1523	to arrange to skip the extra bytes. */
	1524
	1525	#ifdef SUPPORT_UTF8
	1526	if (utf8) switch(c)
	1527	{
	1528	case OP_CHAR:
	1529	case OP_CHARNC:
	1530	case OP_EXACT:
	1531	case OP_UPTO:
	1532	case OP_MINUPTO:
	1533	case OP_POSUPTO:
	1534	case OP_STAR:
	1535	case OP_MINSTAR:
	1536	case OP_POSSTAR:
	1537	case OP_PLUS:
	1538	case OP_MINPLUS:
	1539	case OP_POSPLUS:
	1540	case OP_QUERY:
	1541	case OP_MINQUERY:
	1542	case OP_POSQUERY:
	1543	if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
	1544	break;
	1545	}
	1546	#else
	1547	(void)(utf8); /* Keep compiler happy by referencing function argument */
	1548	#endif
	1549	}
	1550	}
	1551	}
	1552
	1553
	1554
	1555	/*************************************************
	1556	* Scan compiled branch for non-emptiness *
	1557	*************************************************/
	1558
	1559	/* This function scans through a branch of a compiled pattern to see whether it
	1560	can match the empty string or not. It is called from could_be_empty()
	1561	below and from compile_branch() when checking for an unlimited repeat of a
	1562	group that can match nothing. Note that first_significant_code() skips over
	1563	backward and negative forward assertions when its final argument is TRUE. If we
	1564	hit an unclosed bracket, we return "empty" - this means we've struck an inner
	1565	bracket whose current branch will already have been scanned.
	1566
	1567	Arguments:
	1568	code points to start of search
	1569	endcode points to where to stop
	1570	utf8 TRUE if in UTF8 mode
	1571
	1572	Returns: TRUE if what is matched could be empty
	1573	*/
	1574
	1575	static BOOL
	1576	could_be_empty_branch(const uschar code, const uschar endcode, BOOL utf8)
	1577	{
	1578	register int c;
	1579	for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
	1580	code < endcode;
	1581	code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
	1582	{
	1583	const uschar *ccode;
	1584
	1585	c = *code;
	1586
	1587	/* Skip over forward assertions; the other assertions are skipped by
	1588	first_significant_code() with a TRUE final argument. */
	1589
	1590	if (c == OP_ASSERT)
	1591	{
	1592	do code += GET(code, 1); while (*code == OP_ALT);
	1593	c = *code;
	1594	continue;
	1595	}
	1596
	1597	/* Groups with zero repeats can of course be empty; skip them. */
	1598
	1599	if (c == OP_BRAZERO \|\| c == OP_BRAMINZERO \|\| c == OP_SKIPZERO)
	1600	{
	1601	code += _pcre_OP_lengths[c];
	1602	do code += GET(code, 1); while (*code == OP_ALT);
	1603	c = *code;
	1604	continue;
	1605	}
	1606
	1607	/* For other groups, scan the branches. */
	1608
	1609	if (c == OP_BRA \|\| c == OP_CBRA \|\| c == OP_ONCE \|\| c == OP_COND)
	1610	{
	1611	BOOL empty_branch;
	1612	if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
	1613
	1614	/* Scan a closed bracket */
	1615
	1616	empty_branch = FALSE;
	1617	do
	1618	{
	1619	if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
	1620	empty_branch = TRUE;
	1621	code += GET(code, 1);
	1622	}
	1623	while (*code == OP_ALT);
	1624	if (!empty_branch) return FALSE; /* All branches are non-empty */
	1625	c = *code;
	1626	continue;
	1627	}
	1628
	1629	/* Handle the other opcodes */
	1630
	1631	switch (c)
	1632	{
	1633	/* Check for quantifiers after a class. XCLASS is used for classes that
	1634	cannot be represented just by a bit map. This includes negated single
	1635	high-valued characters. The length in _pcre_OP_lengths[] is zero; the
	1636	actual length is stored in the compiled code, so we must update "code"
	1637	here. */
	1638
	1639	#ifdef SUPPORT_UTF8
	1640	case OP_XCLASS:
	1641	ccode = code += GET(code, 1);
	1642	goto CHECK_CLASS_REPEAT;
	1643	#endif
	1644
	1645	case OP_CLASS:
	1646	case OP_NCLASS:
	1647	ccode = code + 33;
	1648
	1649	#ifdef SUPPORT_UTF8
	1650	CHECK_CLASS_REPEAT:
	1651	#endif
	1652
	1653	switch (*ccode)
	1654	{
	1655	case OP_CRSTAR: /* These could be empty; continue */
	1656	case OP_CRMINSTAR:
	1657	case OP_CRQUERY:
	1658	case OP_CRMINQUERY:
	1659	break;
	1660
	1661	default: /* Non-repeat => class must match */
	1662	case OP_CRPLUS: /* These repeats aren't empty */
	1663	case OP_CRMINPLUS:
	1664	return FALSE;
	1665
	1666	case OP_CRRANGE:
	1667	case OP_CRMINRANGE:
	1668	if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
	1669	break;
	1670	}
	1671	break;
	1672
	1673	/* Opcodes that must match a character */
	1674
	1675	case OP_PROP:
	1676	case OP_NOTPROP:
	1677	case OP_EXTUNI:
	1678	case OP_NOT_DIGIT:
	1679	case OP_DIGIT:
	1680	case OP_NOT_WHITESPACE:
	1681	case OP_WHITESPACE:
	1682	case OP_NOT_WORDCHAR:
	1683	case OP_WORDCHAR:
	1684	case OP_ANY:
	1685	case OP_ALLANY:
	1686	case OP_ANYBYTE:
	1687	case OP_CHAR:
	1688	case OP_CHARNC:
	1689	case OP_NOT:
	1690	case OP_PLUS:
	1691	case OP_MINPLUS:
	1692	case OP_POSPLUS:
	1693	case OP_EXACT:
	1694	case OP_NOTPLUS:
	1695	case OP_NOTMINPLUS:
	1696	case OP_NOTPOSPLUS:
	1697	case OP_NOTEXACT:
	1698	case OP_TYPEPLUS:
	1699	case OP_TYPEMINPLUS:
	1700	case OP_TYPEPOSPLUS:
	1701	case OP_TYPEEXACT:
	1702	return FALSE;
	1703
	1704	/* These are going to continue, as they may be empty, but we have to
	1705	fudge the length for the \p and \P cases. */
	1706
	1707	case OP_TYPESTAR:
	1708	case OP_TYPEMINSTAR:
	1709	case OP_TYPEPOSSTAR:
	1710	case OP_TYPEQUERY:
	1711	case OP_TYPEMINQUERY:
	1712	case OP_TYPEPOSQUERY:
	1713	if (code[1] == OP_PROP \|\| code[1] == OP_NOTPROP) code += 2;
	1714	break;
	1715
	1716	/* Same for these */
	1717
	1718	case OP_TYPEUPTO:
	1719	case OP_TYPEMINUPTO:
	1720	case OP_TYPEPOSUPTO:
	1721	if (code[3] == OP_PROP \|\| code[3] == OP_NOTPROP) code += 2;
	1722	break;
	1723
	1724	/* End of branch */
	1725
	1726	case OP_KET:
	1727	case OP_KETRMAX:
	1728	case OP_KETRMIN:
	1729	case OP_ALT:
	1730	return TRUE;
	1731
	1732	/* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
	1733	MINUPTO, and POSUPTO may be followed by a multibyte character */
	1734
	1735	#ifdef SUPPORT_UTF8
	1736	case OP_STAR:
	1737	case OP_MINSTAR:
	1738	case OP_POSSTAR:
	1739	case OP_QUERY:
	1740	case OP_MINQUERY:
	1741	case OP_POSQUERY:
	1742	case OP_UPTO:
	1743	case OP_MINUPTO:
	1744	case OP_POSUPTO:
	1745	if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
	1746	break;
	1747	#endif
	1748	}
	1749	}
	1750
	1751	return TRUE;
	1752	}
	1753
	1754
	1755
	1756	/*************************************************
	1757	* Scan compiled regex for non-emptiness *
	1758	*************************************************/
	1759
	1760	/* This function is called to check for left recursive calls. We want to check
	1761	the current branch of the current pattern to see if it could match the empty
	1762	string. If it could, we must look outwards for branches at other levels,
	1763	stopping when we pass beyond the bracket which is the subject of the recursion.
	1764
	1765	Arguments:
	1766	code points to start of the recursion
	1767	endcode points to where to stop (current RECURSE item)
	1768	bcptr points to the chain of current (unclosed) branch starts
	1769	utf8 TRUE if in UTF-8 mode
	1770
	1771	Returns: TRUE if what is matched could be empty
	1772	*/
	1773
	1774	static BOOL
	1775	could_be_empty(const uschar code, const uschar endcode, branch_chain *bcptr,
	1776	BOOL utf8)
	1777	{
	1778	while (bcptr != NULL && bcptr->current >= code)
	1779	{
	1780	if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
	1781	bcptr = bcptr->outer;
	1782	}
	1783	return TRUE;
	1784	}
	1785
	1786
	1787
	1788	/*************************************************
	1789	* Check for POSIX class syntax *
	1790	*************************************************/
	1791
	1792	/* This function is called when the sequence "[:" or "[." or "[=" is
	1793	encountered in a character class. It checks whether this is followed by a
	1794	sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
	1795	reach an unescaped ']' without the special preceding character, return FALSE.
	1796
	1797	Originally, this function only recognized a sequence of letters between the
	1798	terminators, but it seems that Perl recognizes any sequence of characters,
	1799	though of course unknown POSIX names are subsequently rejected. Perl gives an
	1800	"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
	1801	didn't consider this to be a POSIX class. Likewise for [:1234:].
	1802
	1803	The problem in trying to be exactly like Perl is in the handling of escapes. We
	1804	have to be sure that [abc[:x\]pqr] is not treated as containing a POSIX
	1805	class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
	1806	below handles the special case of \], but does not try to do any other escape
	1807	processing. This makes it different from Perl for cases such as [:l\ower:]
	1808	where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
	1809	"l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
	1810	I think.
	1811
	1812	Arguments:
	1813	ptr pointer to the initial [
	1814	endptr where to return the end pointer
	1815
	1816	Returns: TRUE or FALSE
	1817	*/
	1818
	1819	static BOOL
	1820	check_posix_syntax(const uschar ptr, const uschar *endptr)
	1821	{
	1822	int terminator; /* Don't combine these lines; the Solaris cc */
	1823	terminator = (++ptr); / compiler warns about "non-constant" initializer. */
	1824	for (++ptr; *ptr != 0; ptr++)
	1825	{
	1826	if (*ptr == '\\' && ptr[1] == ']') ptr++; else
	1827	{
	1828	if (*ptr == ']') return FALSE;
	1829	if (*ptr == terminator && ptr[1] == ']')
	1830	{
	1831	*endptr = ptr;
	1832	return TRUE;
	1833	}
	1834	}
	1835	}
	1836	return FALSE;
	1837	}
	1838
	1839
	1840
	1841
	1842	/*************************************************
	1843	* Check POSIX class name *
	1844	*************************************************/
	1845
	1846	/* This function is called to check the name given in a POSIX-style class entry
	1847	such as [:alnum:].
	1848
	1849	Arguments:
	1850	ptr points to the first letter
	1851	len the length of the name
	1852
	1853	Returns: a value representing the name, or -1 if unknown
	1854	*/
	1855
	1856	static int
	1857	check_posix_name(const uschar *ptr, int len)
	1858	{
	1859	const char *pn = posix_names;
	1860	register int yield = 0;
	1861	while (posix_name_lengths[yield] != 0)
	1862	{
	1863	if (len == posix_name_lengths[yield] &&
	1864	strncmp((const char *)ptr, pn, len) == 0) return yield;
	1865	pn += posix_name_lengths[yield] + 1;
	1866	yield++;
	1867	}
	1868	return -1;
	1869	}
	1870
	1871
	1872	/*************************************************
	1873	* Adjust OP_RECURSE items in repeated group *
	1874	*************************************************/
	1875
	1876	/* OP_RECURSE items contain an offset from the start of the regex to the group
	1877	that is referenced. This means that groups can be replicated for fixed
	1878	repetition simply by copying (because the recursion is allowed to refer to
	1879	earlier groups that are outside the current group). However, when a group is
	1880	optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
	1881	inserted before it, after it has been compiled. This means that any OP_RECURSE
	1882	items within it that refer to the group itself or any contained groups have to
	1883	have their offsets adjusted. That one of the jobs of this function. Before it
	1884	is called, the partially compiled regex must be temporarily terminated with
	1885	OP_END.
	1886
	1887	This function has been extended with the possibility of forward references for
	1888	recursions and subroutine calls. It must also check the list of such references
	1889	for the group we are dealing with. If it finds that one of the recursions in
	1890	the current group is on this list, it adjusts the offset in the list, not the
	1891	value in the reference (which is a group number).
	1892
	1893	Arguments:
	1894	group points to the start of the group
	1895	adjust the amount by which the group is to be moved
	1896	utf8 TRUE in UTF-8 mode
	1897	cd contains pointers to tables etc.
	1898	save_hwm the hwm forward reference pointer at the start of the group
	1899
	1900	Returns: nothing
	1901	*/
	1902
	1903	static void
	1904	adjust_recurse(uschar group, int adjust, BOOL utf8, compile_data cd,
	1905	uschar *save_hwm)
	1906	{
	1907	uschar *ptr = group;
	1908
	1909	while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
	1910	{
	1911	int offset;
	1912	uschar *hc;
	1913
	1914	/* See if this recursion is on the forward reference list. If so, adjust the
	1915	reference. */
	1916
	1917	for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
	1918	{
	1919	offset = GET(hc, 0);
	1920	if (cd->start_code + offset == ptr + 1)
	1921	{
	1922	PUT(hc, 0, offset + adjust);
	1923	break;
	1924	}
	1925	}
	1926
	1927	/* Otherwise, adjust the recursion offset if it's after the start of this
	1928	group. */
	1929
	1930	if (hc >= cd->hwm)
	1931	{
	1932	offset = GET(ptr, 1);
	1933	if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
	1934	}
	1935
	1936	ptr += 1 + LINK_SIZE;
	1937	}
	1938	}
	1939
	1940
	1941
	1942	/*************************************************
	1943	* Insert an automatic callout point *
	1944	*************************************************/
	1945
	1946	/* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
	1947	callout points before each pattern item.
	1948
	1949	Arguments:
	1950	code current code pointer
	1951	ptr current pattern pointer
	1952	cd pointers to tables etc
	1953
	1954	Returns: new code pointer
	1955	*/
	1956
	1957	static uschar *
	1958	auto_callout(uschar code, const uschar ptr, compile_data *cd)
	1959	{
	1960	*code++ = OP_CALLOUT;
	1961	*code++ = 255;
	1962	PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
	1963	PUT(code, LINK_SIZE, 0); /* Default length */
	1964	return code + 2*LINK_SIZE;
	1965	}
	1966
	1967
	1968
	1969	/*************************************************
	1970	* Complete a callout item *
	1971	*************************************************/
	1972
	1973	/* A callout item contains the length of the next item in the pattern, which
	1974	we can't fill in till after we have reached the relevant point. This is used
	1975	for both automatic and manual callouts.
	1976
	1977	Arguments:
	1978	previous_callout points to previous callout item
	1979	ptr current pattern pointer
	1980	cd pointers to tables etc
	1981
	1982	Returns: nothing
	1983	*/
	1984
	1985	static void
	1986	complete_callout(uschar previous_callout, const uschar ptr, compile_data *cd)
	1987	{
	1988	int length = ptr - cd->start_pattern - GET(previous_callout, 2);
	1989	PUT(previous_callout, 2 + LINK_SIZE, length);
	1990	}
	1991
	1992
	1993
	1994	#ifdef SUPPORT_UCP
	1995	/*************************************************
	1996	* Get othercase range *
	1997	*************************************************/
	1998
	1999	/* This function is passed the start and end of a class range, in UTF-8 mode
	2000	with UCP support. It searches up the characters, looking for internal ranges of
	2001	characters in the "other" case. Each call returns the next one, updating the
	2002	start address.
	2003
	2004	Arguments:
	2005	cptr points to starting character value; updated
	2006	d end value
	2007	ocptr where to put start of othercase range
	2008	odptr where to put end of othercase range
	2009
	2010	Yield: TRUE when range returned; FALSE when no more
	2011	*/
	2012
	2013	static BOOL
	2014	get_othercase_range(unsigned int cptr, unsigned int d, unsigned int ocptr,
	2015	unsigned int *odptr)
	2016	{
	2017	unsigned int c, othercase, next;
	2018
	2019	for (c = *cptr; c <= d; c++)
	2020	{ if ((othercase = UCD_OTHERCASE(c)) != c) break; }
	2021
	2022	if (c > d) return FALSE;
	2023
	2024	*ocptr = othercase;
	2025	next = othercase + 1;
	2026
	2027	for (++c; c <= d; c++)
	2028	{
	2029	if (UCD_OTHERCASE(c) != next) break;
	2030	next++;
	2031	}
	2032
	2033	*odptr = next - 1;
	2034	*cptr = c;
	2035
	2036	return TRUE;
	2037	}
	2038	#endif /* SUPPORT_UCP */
	2039
	2040
	2041
	2042	/*************************************************
	2043	* Check if auto-possessifying is possible *
	2044	*************************************************/
	2045
	2046	/* This function is called for unlimited repeats of certain items, to see
	2047	whether the next thing could possibly match the repeated item. If not, it makes
	2048	sense to automatically possessify the repeated item.
	2049
	2050	Arguments:
	2051	op_code the repeated op code
	2052	this data for this item, depends on the opcode
	2053	utf8 TRUE in UTF-8 mode
	2054	utf8_char used for utf8 character bytes, NULL if not relevant
	2055	ptr next character in pattern
	2056	options options bits
	2057	cd contains pointers to tables etc.
	2058
	2059	Returns: TRUE if possessifying is wanted
	2060	*/
	2061
	2062	static BOOL
	2063	check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
	2064	const uschar ptr, int options, compile_data cd)
	2065	{
	2066	int next;
	2067
	2068	/* Skip whitespace and comments in extended mode */
	2069
	2070	if ((options & PCRE_EXTENDED) != 0)
	2071	{
	2072	for (;;)
	2073	{
	2074	while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
	2075	if (*ptr == '#')
	2076	{
	2077	while (*(++ptr) != 0)
	2078	if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
	2079	}
	2080	else break;
	2081	}
	2082	}
	2083
	2084	/* If the next item is one that we can handle, get its value. A non-negative
	2085	value is a character, a negative value is an escape value. */
	2086
	2087	if (*ptr == '\\')
	2088	{
	2089	int temperrorcode = 0;
	2090	next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
	2091	if (temperrorcode != 0) return FALSE;
	2092	ptr++; /* Point after the escape sequence */
	2093	}
	2094
	2095	else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
	2096	{
	2097	#ifdef SUPPORT_UTF8
	2098	if (utf8) { GETCHARINC(next, ptr); } else
	2099	#endif
	2100	next = *ptr++;
	2101	}
	2102
	2103	else return FALSE;
	2104
	2105	/* Skip whitespace and comments in extended mode */
	2106
	2107	if ((options & PCRE_EXTENDED) != 0)
	2108	{
	2109	for (;;)
	2110	{
	2111	while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
	2112	if (*ptr == '#')
	2113	{
	2114	while (*(++ptr) != 0)
	2115	if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
	2116	}
	2117	else break;
	2118	}
	2119	}
	2120
	2121	/* If the next thing is itself optional, we have to give up. */
	2122
	2123	if (ptr == '' \|\| ptr == '?' \|\| strncmp((char )ptr, "{0,", 3) == 0)
	2124	return FALSE;
	2125
	2126	/* Now compare the next item with the previous opcode. If the previous is a
	2127	positive single character match, "item" either contains the character or, if
	2128	"item" is greater than 127 in utf8 mode, the character's bytes are in
	2129	utf8_char. */
	2130
	2131
	2132	/* Handle cases when the next item is a character. */
	2133
	2134	if (next >= 0) switch(op_code)
	2135	{
	2136	case OP_CHAR:
	2137	#ifdef SUPPORT_UTF8
	2138	if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
	2139	#else
	2140	(void)(utf8_char); /* Keep compiler happy by referencing function argument */
	2141	#endif
	2142	return item != next;
	2143
	2144	/* For CHARNC (caseless character) we must check the other case. If we have
	2145	Unicode property support, we can use it to test the other case of
	2146	high-valued characters. */
	2147
	2148	case OP_CHARNC:
	2149	#ifdef SUPPORT_UTF8
	2150	if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
	2151	#endif
	2152	if (item == next) return FALSE;
	2153	#ifdef SUPPORT_UTF8
	2154	if (utf8)
	2155	{
	2156	unsigned int othercase;
	2157	if (next < 128) othercase = cd->fcc[next]; else
	2158	#ifdef SUPPORT_UCP
	2159	othercase = UCD_OTHERCASE((unsigned int)next);
	2160	#else
	2161	othercase = NOTACHAR;
	2162	#endif
	2163	return (unsigned int)item != othercase;
	2164	}
	2165	else
	2166	#endif /* SUPPORT_UTF8 */
	2167	return (item != cd->fcc[next]); /* Non-UTF-8 mode */
	2168
	2169	/* For OP_NOT, "item" must be a single-byte character. */
	2170
	2171	case OP_NOT:
	2172	if (item == next) return TRUE;
	2173	if ((options & PCRE_CASELESS) == 0) return FALSE;
	2174	#ifdef SUPPORT_UTF8
	2175	if (utf8)
	2176	{
	2177	unsigned int othercase;
	2178	if (next < 128) othercase = cd->fcc[next]; else
	2179	#ifdef SUPPORT_UCP
	2180	othercase = UCD_OTHERCASE(next);
	2181	#else
	2182	othercase = NOTACHAR;
	2183	#endif
	2184	return (unsigned int)item == othercase;
	2185	}
	2186	else
	2187	#endif /* SUPPORT_UTF8 */
	2188	return (item == cd->fcc[next]); /* Non-UTF-8 mode */
	2189
	2190	case OP_DIGIT:
	2191	return next > 127 \|\| (cd->ctypes[next] & ctype_digit) == 0;
	2192
	2193	case OP_NOT_DIGIT:
	2194	return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
	2195
	2196	case OP_WHITESPACE:
	2197	return next > 127 \|\| (cd->ctypes[next] & ctype_space) == 0;
	2198
	2199	case OP_NOT_WHITESPACE:
	2200	return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
	2201
	2202	case OP_WORDCHAR:
	2203	return next > 127 \|\| (cd->ctypes[next] & ctype_word) == 0;
	2204
	2205	case OP_NOT_WORDCHAR:
	2206	return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
	2207
	2208	case OP_HSPACE:
	2209	case OP_NOT_HSPACE:
	2210	switch(next)
	2211	{
	2212	case 0x09:
	2213	case 0x20:
	2214	case 0xa0:
	2215	case 0x1680:
	2216	case 0x180e:
	2217	case 0x2000:
	2218	case 0x2001:
	2219	case 0x2002:
	2220	case 0x2003:
	2221	case 0x2004:
	2222	case 0x2005:
	2223	case 0x2006:
	2224	case 0x2007:
	2225	case 0x2008:
	2226	case 0x2009:
	2227	case 0x200A:
	2228	case 0x202f:
	2229	case 0x205f:
	2230	case 0x3000:
	2231	return op_code != OP_HSPACE;
	2232	default:
	2233	return op_code == OP_HSPACE;
	2234	}
	2235
	2236	case OP_VSPACE:
	2237	case OP_NOT_VSPACE:
	2238	switch(next)
	2239	{
	2240	case 0x0a:
	2241	case 0x0b:
	2242	case 0x0c:
	2243	case 0x0d:
	2244	case 0x85:
	2245	case 0x2028:
	2246	case 0x2029:
	2247	return op_code != OP_VSPACE;
	2248	default:
	2249	return op_code == OP_VSPACE;
	2250	}
	2251
	2252	default:
	2253	return FALSE;
	2254	}
	2255
	2256
	2257	/* Handle the case when the next item is \d, \s, etc. */
	2258
	2259	switch(op_code)
	2260	{
	2261	case OP_CHAR:
	2262	case OP_CHARNC:
	2263	#ifdef SUPPORT_UTF8
	2264	if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
	2265	#endif
	2266	switch(-next)
	2267	{
	2268	case ESC_d:
	2269	return item > 127 \|\| (cd->ctypes[item] & ctype_digit) == 0;
	2270
	2271	case ESC_D:
	2272	return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
	2273
	2274	case ESC_s:
	2275	return item > 127 \|\| (cd->ctypes[item] & ctype_space) == 0;
	2276
	2277	case ESC_S:
	2278	return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
	2279
	2280	case ESC_w:
	2281	return item > 127 \|\| (cd->ctypes[item] & ctype_word) == 0;
	2282
	2283	case ESC_W:
	2284	return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
	2285
	2286	case ESC_h:
	2287	case ESC_H:
	2288	switch(item)
	2289	{
	2290	case 0x09:
	2291	case 0x20:
	2292	case 0xa0:
	2293	case 0x1680:
	2294	case 0x180e:
	2295	case 0x2000:
	2296	case 0x2001:
	2297	case 0x2002:
	2298	case 0x2003:
	2299	case 0x2004:
	2300	case 0x2005:
	2301	case 0x2006:
	2302	case 0x2007:
	2303	case 0x2008:
	2304	case 0x2009:
	2305	case 0x200A:
	2306	case 0x202f:
	2307	case 0x205f:
	2308	case 0x3000:
	2309	return -next != ESC_h;
	2310	default:
	2311	return -next == ESC_h;
	2312	}
	2313
	2314	case ESC_v:
	2315	case ESC_V:
	2316	switch(item)
	2317	{
	2318	case 0x0a:
	2319	case 0x0b:
	2320	case 0x0c:
	2321	case 0x0d:
	2322	case 0x85:
	2323	case 0x2028:
	2324	case 0x2029:
	2325	return -next != ESC_v;
	2326	default:
	2327	return -next == ESC_v;
	2328	}
	2329
	2330	default:
	2331	return FALSE;
	2332	}
	2333
	2334	case OP_DIGIT:
	2335	return next == -ESC_D \|\| next == -ESC_s \|\| next == -ESC_W \|\|
	2336	next == -ESC_h \|\| next == -ESC_v;
	2337
	2338	case OP_NOT_DIGIT:
	2339	return next == -ESC_d;
	2340
	2341	case OP_WHITESPACE:
	2342	return next == -ESC_S \|\| next == -ESC_d \|\| next == -ESC_w;
	2343
	2344	case OP_NOT_WHITESPACE:
	2345	return next == -ESC_s \|\| next == -ESC_h \|\| next == -ESC_v;
	2346
	2347	case OP_HSPACE:
	2348	return next == -ESC_S \|\| next == -ESC_H \|\| next == -ESC_d \|\| next == -ESC_w;
	2349
	2350	case OP_NOT_HSPACE:
	2351	return next == -ESC_h;
	2352
	2353	/* Can't have \S in here because VT matches \S (Perl anomaly) */
	2354	case OP_VSPACE:
	2355	return next == -ESC_V \|\| next == -ESC_d \|\| next == -ESC_w;
	2356
	2357	case OP_NOT_VSPACE:
	2358	return next == -ESC_v;
	2359
	2360	case OP_WORDCHAR:
	2361	return next == -ESC_W \|\| next == -ESC_s \|\| next == -ESC_h \|\| next == -ESC_v;
	2362
	2363	case OP_NOT_WORDCHAR:
	2364	return next == -ESC_w \|\| next == -ESC_d;
	2365
	2366	default:
	2367	return FALSE;
	2368	}
	2369
	2370	/* Control does not reach here */
	2371	}
	2372
	2373
	2374
	2375	/*************************************************
	2376	* Compile one branch *
	2377	*************************************************/
	2378
	2379	/* Scan the pattern, compiling it into a vector. If the options are
	2380	changed during the branch, the pointer is used to change the external options
	2381	bits. This function is used during the pre-compile phase when we are trying
	2382	to find out the amount of memory needed, as well as during the real compile
	2383	phase. The value of lengthptr distinguishes the two phases.
	2384
	2385	Arguments:
	2386	optionsptr pointer to the option bits
	2387	codeptr points to the pointer to the current code point
	2388	ptrptr points to the current pattern pointer
	2389	errorcodeptr points to error code variable
	2390	firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
	2391	reqbyteptr set to the last literal character required, else < 0
	2392	bcptr points to current branch chain
	2393	cd contains pointers to tables etc.
	2394	lengthptr NULL during the real compile phase
	2395	points to length accumulator during pre-compile phase
	2396
	2397	Returns: TRUE on success
	2398	FALSE, with *errorcodeptr set non-zero on error
	2399	*/
	2400
	2401	static BOOL
	2402	compile_branch(int optionsptr, uschar codeptr, const uschar *ptrptr,
	2403	int errorcodeptr, int firstbyteptr, int reqbyteptr, branch_chain bcptr,
	2404	compile_data cd, int lengthptr)
	2405	{
	2406	int repeat_type, op_type;
	2407	int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
	2408	int bravalue = 0;
	2409	int greedy_default, greedy_non_default;
	2410	int firstbyte, reqbyte;
	2411	int zeroreqbyte, zerofirstbyte;
	2412	int req_caseopt, reqvary, tempreqvary;
	2413	int options = *optionsptr;
	2414	int after_manual_callout = 0;
	2415	int length_prevgroup = 0;
	2416	register int c;
	2417	register uschar code = codeptr;
	2418	uschar *last_code = code;
	2419	uschar *orig_code = code;
	2420	uschar *tempcode;
	2421	BOOL inescq = FALSE;
	2422	BOOL groupsetfirstbyte = FALSE;
	2423	const uschar ptr = ptrptr;
	2424	const uschar *tempptr;
	2425	uschar *previous = NULL;
	2426	uschar *previous_callout = NULL;
	2427	uschar *save_hwm = NULL;
	2428	uschar classbits[32];
	2429
	2430	#ifdef SUPPORT_UTF8
	2431	BOOL class_utf8;
	2432	BOOL utf8 = (options & PCRE_UTF8) != 0;
	2433	uschar *class_utf8data;
	2434	uschar *class_utf8data_base;
	2435	uschar utf8_char[6];
	2436	#else
	2437	BOOL utf8 = FALSE;
	2438	uschar *utf8_char = NULL;
	2439	#endif
	2440
	2441	#ifdef DEBUG
	2442	if (lengthptr != NULL) DPRINTF((">> start branch\n"));
	2443	#endif
	2444
	2445	/* Set up the default and non-default settings for greediness */
	2446
	2447	greedy_default = ((options & PCRE_UNGREEDY) != 0);
	2448	greedy_non_default = greedy_default ^ 1;
	2449
	2450	/* Initialize no first byte, no required byte. REQ_UNSET means "no char
	2451	matching encountered yet". It gets changed to REQ_NONE if we hit something that
	2452	matches a non-fixed char first char; reqbyte just remains unset if we never
	2453	find one.
	2454
	2455	When we hit a repeat whose minimum is zero, we may have to adjust these values
	2456	to take the zero repeat into account. This is implemented by setting them to
	2457	zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
	2458	item types that can be repeated set these backoff variables appropriately. */
	2459
	2460	firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
	2461
	2462	/* The variable req_caseopt contains either the REQ_CASELESS value or zero,
	2463	according to the current setting of the caseless flag. REQ_CASELESS is a bit
	2464	value > 255. It is added into the firstbyte or reqbyte variables to record the
	2465	case status of the value. This is used only for ASCII characters. */
	2466
	2467	req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
	2468
	2469	/* Switch on next character until the end of the branch */
	2470
	2471	for (;; ptr++)
	2472	{
	2473	BOOL negate_class;
	2474	BOOL should_flip_negation;
	2475	BOOL possessive_quantifier;
	2476	BOOL is_quantifier;
	2477	BOOL is_recurse;
	2478	BOOL reset_bracount;
	2479	int class_charcount;
	2480	int class_lastchar;
	2481	int newoptions;
	2482	int recno;
	2483	int refsign;
	2484	int skipbytes;
	2485	int subreqbyte;
	2486	int subfirstbyte;
	2487	int terminator;
	2488	int mclength;
	2489	uschar mcbuffer[8];
	2490
	2491	/* Get next byte in the pattern */
	2492
	2493	c = *ptr;
	2494
	2495	/* If we are in the pre-compile phase, accumulate the length used for the
	2496	previous cycle of this loop. */
	2497
	2498	if (lengthptr != NULL)
	2499	{
	2500	#ifdef DEBUG
	2501	if (code > cd->hwm) cd->hwm = code; /* High water info */
	2502	#endif
	2503	if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
	2504	{
	2505	*errorcodeptr = ERR52;
	2506	goto FAILED;
	2507	}
	2508
	2509	/* There is at least one situation where code goes backwards: this is the
	2510	case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
	2511	the class is simply eliminated. However, it is created first, so we have to
	2512	allow memory for it. Therefore, don't ever reduce the length at this point.
	2513	*/
	2514
	2515	if (code < last_code) code = last_code;
	2516
	2517	/* Paranoid check for integer overflow */
	2518
	2519	if (OFLOW_MAX - *lengthptr < code - last_code)
	2520	{
	2521	*errorcodeptr = ERR20;
	2522	goto FAILED;
	2523	}
	2524
	2525	*lengthptr += code - last_code;
	2526	DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
	2527
	2528	/* If "previous" is set and it is not at the start of the work space, move
	2529	it back to there, in order to avoid filling up the work space. Otherwise,
	2530	if "previous" is NULL, reset the current code pointer to the start. */
	2531
	2532	if (previous != NULL)
	2533	{
	2534	if (previous > orig_code)
	2535	{
	2536	memmove(orig_code, previous, code - previous);
	2537	code -= previous - orig_code;
	2538	previous = orig_code;
	2539	}
	2540	}
	2541	else code = orig_code;
	2542
	2543	/* Remember where this code item starts so we can pick up the length
	2544	next time round. */
	2545
	2546	last_code = code;
	2547	}
	2548
	2549	/* In the real compile phase, just check the workspace used by the forward
	2550	reference list. */
	2551
	2552	else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
	2553	{
	2554	*errorcodeptr = ERR52;
	2555	goto FAILED;
	2556	}
	2557
	2558	/* If in \Q...\E, check for the end; if not, we have a literal */
	2559
	2560	if (inescq && c != 0)
	2561	{
	2562	if (c == '\\' && ptr[1] == 'E')
	2563	{
	2564	inescq = FALSE;
	2565	ptr++;
	2566	continue;
	2567	}
	2568	else
	2569	{
	2570	if (previous_callout != NULL)
	2571	{
	2572	if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
	2573	complete_callout(previous_callout, ptr, cd);
	2574	previous_callout = NULL;
	2575	}
	2576	if ((options & PCRE_AUTO_CALLOUT) != 0)
	2577	{
	2578	previous_callout = code;
	2579	code = auto_callout(code, ptr, cd);
	2580	}
	2581	goto NORMAL_CHAR;
	2582	}
	2583	}
	2584
	2585	/* Fill in length of a previous callout, except when the next thing is
	2586	a quantifier. */
	2587
	2588	is_quantifier = c == '*' \|\| c == '+' \|\| c == '?' \|\|
	2589	(c == '{' && is_counted_repeat(ptr+1));
	2590
	2591	if (!is_quantifier && previous_callout != NULL &&
	2592	after_manual_callout-- <= 0)
	2593	{
	2594	if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
	2595	complete_callout(previous_callout, ptr, cd);
	2596	previous_callout = NULL;
	2597	}
	2598
	2599	/* In extended mode, skip white space and comments */
	2600
	2601	if ((options & PCRE_EXTENDED) != 0)
	2602	{
	2603	if ((cd->ctypes[c] & ctype_space) != 0) continue;
	2604	if (c == '#')
	2605	{
	2606	while (*(++ptr) != 0)
	2607	{
	2608	if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
	2609	}
	2610	if (*ptr != 0) continue;
	2611
	2612	/* Else fall through to handle end of string */
	2613	c = 0;
	2614	}
	2615	}
	2616
	2617	/* No auto callout for quantifiers. */
	2618
	2619	if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
	2620	{
	2621	previous_callout = code;
	2622	code = auto_callout(code, ptr, cd);
	2623	}
	2624
	2625	switch(c)
	2626	{
	2627	/* ===================================================================*/
	2628	case 0: /* The branch terminates at string end */
	2629	case '\|': /* or \| or ) */
	2630	case ')':
	2631	*firstbyteptr = firstbyte;
	2632	*reqbyteptr = reqbyte;
	2633	*codeptr = code;
	2634	*ptrptr = ptr;
	2635	if (lengthptr != NULL)
	2636	{
	2637	if (OFLOW_MAX - *lengthptr < code - last_code)
	2638	{
	2639	*errorcodeptr = ERR20;
	2640	goto FAILED;
	2641	}
	2642	lengthptr += code - last_code; / To include callout length */
	2643	DPRINTF((">> end branch\n"));
	2644	}
	2645	return TRUE;
	2646
	2647
	2648	/* ===================================================================*/
	2649	/* Handle single-character metacharacters. In multiline mode, ^ disables
	2650	the setting of any following char as a first character. */
	2651
	2652	case '^':
	2653	if ((options & PCRE_MULTILINE) != 0)
	2654	{
	2655	if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
	2656	}
	2657	previous = NULL;
	2658	*code++ = OP_CIRC;
	2659	break;
	2660
	2661	case '$':
	2662	previous = NULL;
	2663	*code++ = OP_DOLL;
	2664	break;
	2665
	2666	/* There can never be a first char if '.' is first, whatever happens about
	2667	repeats. The value of reqbyte doesn't change either. */
	2668
	2669	case '.':
	2670	if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
	2671	zerofirstbyte = firstbyte;
	2672	zeroreqbyte = reqbyte;
	2673	previous = code;
	2674	*code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
	2675	break;
	2676
	2677
	2678	/* ===================================================================*/
	2679	/* Character classes. If the included characters are all < 256, we build a
	2680	32-byte bitmap of the permitted characters, except in the special case
	2681	where there is only one such character. For negated classes, we build the
	2682	map as usual, then invert it at the end. However, we use a different opcode
	2683	so that data characters > 255 can be handled correctly.
	2684
	2685	If the class contains characters outside the 0-255 range, a different
	2686	opcode is compiled. It may optionally have a bit map for characters < 256,
	2687	but those above are are explicitly listed afterwards. A flag byte tells
	2688	whether the bitmap is present, and whether this is a negated class or not.
	2689
	2690	In JavaScript compatibility mode, an isolated ']' causes an error. In
	2691	default (Perl) mode, it is treated as a data character. */
	2692
	2693	case ']':
	2694	if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
	2695	{
	2696	*errorcodeptr = ERR64;
	2697	goto FAILED;
	2698	}
	2699	goto NORMAL_CHAR;
	2700
	2701	case '[':
	2702	previous = code;
	2703
	2704	/* PCRE supports POSIX class stuff inside a class. Perl gives an error if
	2705	they are encountered at the top level, so we'll do that too. */
	2706
	2707	if ((ptr[1] == ':' \|\| ptr[1] == '.' \|\| ptr[1] == '=') &&
	2708	check_posix_syntax(ptr, &tempptr))
	2709	{
	2710	*errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
	2711	goto FAILED;
	2712	}
	2713
	2714	/* If the first character is '^', set the negation flag and skip it. Also,
	2715	if the first few characters (either before or after ^) are \Q\E or \E we
	2716	skip them too. This makes for compatibility with Perl. */
	2717
	2718	negate_class = FALSE;
	2719	for (;;)
	2720	{
	2721	c = *(++ptr);
	2722	if (c == '\\')
	2723	{
	2724	if (ptr[1] == 'E') ptr++;
	2725	else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
	2726	else break;
	2727	}
	2728	else if (!negate_class && c == '^')
	2729	negate_class = TRUE;
	2730	else break;
	2731	}
	2732
	2733	/* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
	2734	an initial ']' is taken as a data character -- the code below handles
	2735	that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
	2736	[^] must match any character, so generate OP_ALLANY. */
	2737
	2738	if (c ==']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
	2739	{
	2740	*code++ = negate_class? OP_ALLANY : OP_FAIL;
	2741	if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
	2742	zerofirstbyte = firstbyte;
	2743	break;
	2744	}
	2745
	2746	/* If a class contains a negative special such as \S, we need to flip the
	2747	negation flag at the end, so that support for characters > 255 works
	2748	correctly (they are all included in the class). */
	2749
	2750	should_flip_negation = FALSE;
	2751
	2752	/* Keep a count of chars with values < 256 so that we can optimize the case
	2753	of just a single character (as long as it's < 256). However, For higher
	2754	valued UTF-8 characters, we don't yet do any optimization. */
	2755
	2756	class_charcount = 0;
	2757	class_lastchar = -1;
	2758
	2759	/* Initialize the 32-char bit map to all zeros. We build the map in a
	2760	temporary bit of memory, in case the class contains only 1 character (less
	2761	than 256), because in that case the compiled code doesn't use the bit map.
	2762	*/
	2763
	2764	memset(classbits, 0, 32 * sizeof(uschar));
	2765
	2766	#ifdef SUPPORT_UTF8
	2767	class_utf8 = FALSE; /* No chars >= 256 */
	2768	class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
	2769	class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
	2770	#endif
	2771
	2772	/* Process characters until ] is reached. By writing this as a "do" it
	2773	means that an initial ] is taken as a data character. At the start of the
	2774	loop, c contains the first byte of the character. */
	2775
	2776	if (c != 0) do
	2777	{
	2778	const uschar *oldptr;
	2779
	2780	#ifdef SUPPORT_UTF8
	2781	if (utf8 && c > 127)
	2782	{ /* Braces are required because the */
	2783	GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
	2784	}
	2785
	2786	/* In the pre-compile phase, accumulate the length of any UTF-8 extra
	2787	data and reset the pointer. This is so that very large classes that
	2788	contain a zillion UTF-8 characters no longer overwrite the work space
	2789	(which is on the stack). */
	2790
	2791	if (lengthptr != NULL)
	2792	{
	2793	*lengthptr += class_utf8data - class_utf8data_base;
	2794	class_utf8data = class_utf8data_base;
	2795	}
	2796
	2797	#endif
	2798
	2799	/* Inside \Q...\E everything is literal except \E */
	2800
	2801	if (inescq)
	2802	{
	2803	if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
	2804	{
	2805	inescq = FALSE; /* Reset literal state */
	2806	ptr++; /* Skip the 'E' */
	2807	continue; /* Carry on with next */
	2808	}
	2809	goto CHECK_RANGE; /* Could be range if \E follows */
	2810	}
	2811
	2812	/* Handle POSIX class names. Perl allows a negation extension of the
	2813	form [:^name:]. A square bracket that doesn't match the syntax is
	2814	treated as a literal. We also recognize the POSIX constructions
	2815	[.ch.] and [=ch=] ("collating elements") and fault them, as Perl
	2816	5.6 and 5.8 do. */
	2817
	2818	if (c == '[' &&
	2819	(ptr[1] == ':' \|\| ptr[1] == '.' \|\| ptr[1] == '=') &&
	2820	check_posix_syntax(ptr, &tempptr))
	2821	{
	2822	BOOL local_negate = FALSE;
	2823	int posix_class, taboffset, tabopt;
	2824	register const uschar *cbits = cd->cbits;
	2825	uschar pbits[32];
	2826
	2827	if (ptr[1] != ':')
	2828	{
	2829	*errorcodeptr = ERR31;
	2830	goto FAILED;
	2831	}
	2832
	2833	ptr += 2;
	2834	if (*ptr == '^')
	2835	{
	2836	local_negate = TRUE;
	2837	should_flip_negation = TRUE; /* Note negative special */
	2838	ptr++;
	2839	}
	2840
	2841	posix_class = check_posix_name(ptr, tempptr - ptr);
	2842	if (posix_class < 0)
	2843	{
	2844	*errorcodeptr = ERR30;
	2845	goto FAILED;
	2846	}
	2847
	2848	/* If matching is caseless, upper and lower are converted to
	2849	alpha. This relies on the fact that the class table starts with
	2850	alpha, lower, upper as the first 3 entries. */
	2851
	2852	if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
	2853	posix_class = 0;
	2854
	2855	/* We build the bit map for the POSIX class in a chunk of local store
	2856	because we may be adding and subtracting from it, and we don't want to
	2857	subtract bits that may be in the main map already. At the end we or the
	2858	result into the bit map that is being built. */
	2859
	2860	posix_class *= 3;
	2861
	2862	/* Copy in the first table (always present) */
	2863
	2864	memcpy(pbits, cbits + posix_class_maps[posix_class],
	2865	32 * sizeof(uschar));
	2866
	2867	/* If there is a second table, add or remove it as required. */
	2868
	2869	taboffset = posix_class_maps[posix_class + 1];
	2870	tabopt = posix_class_maps[posix_class + 2];
	2871
	2872	if (taboffset >= 0)
	2873	{
	2874	if (tabopt >= 0)
	2875	for (c = 0; c < 32; c++) pbits[c] \|= cbits[c + taboffset];
	2876	else
	2877	for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
	2878	}
	2879
	2880	/* Not see if we need to remove any special characters. An option
	2881	value of 1 removes vertical space and 2 removes underscore. */
	2882
	2883	if (tabopt < 0) tabopt = -tabopt;
	2884	if (tabopt == 1) pbits[1] &= ~0x3c;
	2885	else if (tabopt == 2) pbits[11] &= 0x7f;
	2886
	2887	/* Add the POSIX table or its complement into the main table that is
	2888	being built and we are done. */
	2889
	2890	if (local_negate)
	2891	for (c = 0; c < 32; c++) classbits[c] \|= ~pbits[c];
	2892	else
	2893	for (c = 0; c < 32; c++) classbits[c] \|= pbits[c];
	2894
	2895	ptr = tempptr + 1;
	2896	class_charcount = 10; /* Set > 1; assumes more than 1 per class */
	2897	continue; /* End of POSIX syntax handling */
	2898	}
	2899
	2900	/* Backslash may introduce a single character, or it may introduce one
	2901	of the specials, which just set a flag. The sequence \b is a special
	2902	case. Inside a class (and only there) it is treated as backspace.
	2903	Elsewhere it marks a word boundary. Other escapes have preset maps ready
	2904	to 'or' into the one we are building. We assume they have more than one
	2905	character in them, so set class_charcount bigger than one. */
	2906
	2907	if (c == '\\')
	2908	{
	2909	c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
	2910	if (*errorcodeptr != 0) goto FAILED;
	2911
	2912	if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
	2913	else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
	2914	else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
	2915	else if (-c == ESC_Q) /* Handle start of quoted string */
	2916	{
	2917	if (ptr[1] == '\\' && ptr[2] == 'E')
	2918	{
	2919	ptr += 2; /* avoid empty string */
	2920	}
	2921	else inescq = TRUE;
	2922	continue;
	2923	}
	2924	else if (-c == ESC_E) continue; /* Ignore orphan \E */
	2925
	2926	if (c < 0)
	2927	{
	2928	register const uschar *cbits = cd->cbits;
	2929	class_charcount += 2; /* Greater than 1 is what matters */
	2930
	2931	/* Save time by not doing this in the pre-compile phase. */
	2932
	2933	if (lengthptr == NULL) switch (-c)
	2934	{
	2935	case ESC_d:
	2936	for (c = 0; c < 32; c++) classbits[c] \|= cbits[c+cbit_digit];
	2937	continue;
	2938
	2939	case ESC_D:
	2940	should_flip_negation = TRUE;
	2941	for (c = 0; c < 32; c++) classbits[c] \|= ~cbits[c+cbit_digit];
	2942	continue;
	2943
	2944	case ESC_w:
	2945	for (c = 0; c < 32; c++) classbits[c] \|= cbits[c+cbit_word];
	2946	continue;
	2947
	2948	case ESC_W:
	2949	should_flip_negation = TRUE;
	2950	for (c = 0; c < 32; c++) classbits[c] \|= ~cbits[c+cbit_word];
	2951	continue;
	2952
	2953	case ESC_s:
	2954	for (c = 0; c < 32; c++) classbits[c] \|= cbits[c+cbit_space];
	2955	classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
	2956	continue;
	2957
	2958	case ESC_S:
	2959	should_flip_negation = TRUE;
	2960	for (c = 0; c < 32; c++) classbits[c] \|= ~cbits[c+cbit_space];
	2961	classbits[1] \|= 0x08; /* Perl 5.004 onwards omits VT from \s */
	2962	continue;
	2963
	2964	default: /* Not recognized; fall through */
	2965	break; /* Need "default" setting to stop compiler warning. */
	2966	}
	2967
	2968	/* In the pre-compile phase, just do the recognition. */
	2969
	2970	else if (c == -ESC_d \|\| c == -ESC_D \|\| c == -ESC_w \|\|
	2971	c == -ESC_W \|\| c == -ESC_s \|\| c == -ESC_S) continue;
	2972
	2973	/* We need to deal with \H, \h, \V, and \v in both phases because
	2974	they use extra memory. */
	2975
	2976	if (-c == ESC_h)
	2977	{
	2978	SETBIT(classbits, 0x09); /* VT */
	2979	SETBIT(classbits, 0x20); /* SPACE */
	2980	SETBIT(classbits, 0xa0); /* NBSP */
	2981	#ifdef SUPPORT_UTF8
	2982	if (utf8)
	2983	{
	2984	class_utf8 = TRUE;
	2985	*class_utf8data++ = XCL_SINGLE;
	2986	class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
	2987	*class_utf8data++ = XCL_SINGLE;
	2988	class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
	2989	*class_utf8data++ = XCL_RANGE;
	2990	class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
	2991	class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
	2992	*class_utf8data++ = XCL_SINGLE;
	2993	class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
	2994	*class_utf8data++ = XCL_SINGLE;
	2995	class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
	2996	*class_utf8data++ = XCL_SINGLE;
	2997	class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
	2998	}
	2999	#endif
	3000	continue;
	3001	}
	3002
	3003	if (-c == ESC_H)
	3004	{
	3005	for (c = 0; c < 32; c++)
	3006	{
	3007	int x = 0xff;
	3008	switch (c)
	3009	{
	3010	case 0x09/8: x ^= 1 << (0x09%8); break;
	3011	case 0x20/8: x ^= 1 << (0x20%8); break;
	3012	case 0xa0/8: x ^= 1 << (0xa0%8); break;
	3013	default: break;
	3014	}
	3015	classbits[c] \|= x;
	3016	}
	3017
	3018	#ifdef SUPPORT_UTF8
	3019	if (utf8)
	3020	{
	3021	class_utf8 = TRUE;
	3022	*class_utf8data++ = XCL_RANGE;
	3023	class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
	3024	class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
	3025	*class_utf8data++ = XCL_RANGE;
	3026	class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
	3027	class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
	3028	*class_utf8data++ = XCL_RANGE;
	3029	class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
	3030	class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
	3031	*class_utf8data++ = XCL_RANGE;
	3032	class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
	3033	class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
	3034	*class_utf8data++ = XCL_RANGE;
	3035	class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
	3036	class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
	3037	*class_utf8data++ = XCL_RANGE;
	3038	class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
	3039	class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
	3040	*class_utf8data++ = XCL_RANGE;
	3041	class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
	3042	class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
	3043	}
	3044	#endif
	3045	continue;
	3046	}
	3047
	3048	if (-c == ESC_v)
	3049	{
	3050	SETBIT(classbits, 0x0a); /* LF */
	3051	SETBIT(classbits, 0x0b); /* VT */
	3052	SETBIT(classbits, 0x0c); /* FF */
	3053	SETBIT(classbits, 0x0d); /* CR */
	3054	SETBIT(classbits, 0x85); /* NEL */
	3055	#ifdef SUPPORT_UTF8
	3056	if (utf8)
	3057	{
	3058	class_utf8 = TRUE;
	3059	*class_utf8data++ = XCL_RANGE;
	3060	class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
	3061	class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
	3062	}
	3063	#endif
	3064	continue;
	3065	}
	3066
	3067	if (-c == ESC_V)
	3068	{
	3069	for (c = 0; c < 32; c++)
	3070	{
	3071	int x = 0xff;
	3072	switch (c)
	3073	{
	3074	case 0x0a/8: x ^= 1 << (0x0a%8);
	3075	x ^= 1 << (0x0b%8);
	3076	x ^= 1 << (0x0c%8);
	3077	x ^= 1 << (0x0d%8);
	3078	break;
	3079	case 0x85/8: x ^= 1 << (0x85%8); break;
	3080	default: break;
	3081	}
	3082	classbits[c] \|= x;
	3083	}
	3084
	3085	#ifdef SUPPORT_UTF8
	3086	if (utf8)
	3087	{
	3088	class_utf8 = TRUE;
	3089	*class_utf8data++ = XCL_RANGE;
	3090	class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
	3091	class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
	3092	*class_utf8data++ = XCL_RANGE;
	3093	class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
	3094	class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
	3095	}
	3096	#endif
	3097	continue;
	3098	}
	3099
	3100	/* We need to deal with \P and \p in both phases. */
	3101
	3102	#ifdef SUPPORT_UCP
	3103	if (-c == ESC_p \|\| -c == ESC_P)
	3104	{
	3105	BOOL negated;
	3106	int pdata;
	3107	int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
	3108	if (ptype < 0) goto FAILED;
	3109	class_utf8 = TRUE;
	3110	*class_utf8data++ = ((-c == ESC_p) != negated)?
	3111	XCL_PROP : XCL_NOTPROP;
	3112	*class_utf8data++ = ptype;
	3113	*class_utf8data++ = pdata;
	3114	class_charcount -= 2; /* Not a < 256 character */
	3115	continue;
	3116	}
	3117	#endif
	3118	/* Unrecognized escapes are faulted if PCRE is running in its
	3119	strict mode. By default, for compatibility with Perl, they are
	3120	treated as literals. */
	3121
	3122	if ((options & PCRE_EXTRA) != 0)
	3123	{
	3124	*errorcodeptr = ERR7;
	3125	goto FAILED;
	3126	}
	3127
	3128	class_charcount -= 2; /* Undo the default count from above */
	3129	c = *ptr; /* Get the final character and fall through */
	3130	}
	3131
	3132	/* Fall through if we have a single character (c >= 0). This may be
	3133	greater than 256 in UTF-8 mode. */
	3134
	3135	} /* End of backslash handling */
	3136
	3137	/* A single character may be followed by '-' to form a range. However,
	3138	Perl does not permit ']' to be the end of the range. A '-' character
	3139	at the end is treated as a literal. Perl ignores orphaned \E sequences
	3140	entirely. The code for handling \Q and \E is messy. */
	3141
	3142	CHECK_RANGE:
	3143	while (ptr[1] == '\\' && ptr[2] == 'E')
	3144	{
	3145	inescq = FALSE;
	3146	ptr += 2;
	3147	}
	3148
	3149	oldptr = ptr;
	3150
	3151	/* Remember \r or \n */
	3152
	3153	if (c == '\r' \|\| c == '\n') cd->external_flags \|= PCRE_HASCRORLF;
	3154
	3155	/* Check for range */
	3156
	3157	if (!inescq && ptr[1] == '-')
	3158	{
	3159	int d;
	3160	ptr += 2;
	3161	while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
	3162
	3163	/* If we hit \Q (not followed by \E) at this point, go into escaped
	3164	mode. */
	3165
	3166	while (*ptr == '\\' && ptr[1] == 'Q')
	3167	{
	3168	ptr += 2;
	3169	if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
	3170	inescq = TRUE;
	3171	break;
	3172	}
	3173
	3174	if (*ptr == 0 || (!inescq && *ptr == ']'))
	3175	{
	3176	ptr = oldptr;
	3177	goto LONE_SINGLE_CHARACTER;
	3178	}
	3179
	3180	#ifdef SUPPORT_UTF8
	3181	if (utf8)
	3182	{ /* Braces are required because the */
	3183	GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
	3184	}
	3185	else
	3186	#endif
	3187	d = *ptr; /* Not UTF-8 mode */
	3188
	3189	/* The second part of a range can be a single-character escape, but
	3190	not any of the other escapes. Perl 5.6 treats a hyphen as a literal
	3191	in such circumstances. */
	3192
	3193	if (!inescq && d == '\\')
	3194	{
	3195	d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
	3196	if (*errorcodeptr != 0) goto FAILED;
	3197
	3198	/* \b is backspace; \X is literal X; \R is literal R; any other
	3199	special means the '-' was literal */
	3200
	3201	if (d < 0)
	3202	{
	3203	if (d == -ESC_b) d = '\b';
	3204	else if (d == -ESC_X) d = 'X';
	3205	else if (d == -ESC_R) d = 'R'; else
	3206	{
	3207	ptr = oldptr;
	3208	goto LONE_SINGLE_CHARACTER; /* A few lines below */
	3209	}
	3210	}
	3211	}
	3212
	3213	/* Check that the two values are in the correct order. Optimize
	3214	one-character ranges */
	3215
	3216	if (d < c)
	3217	{
	3218	*errorcodeptr = ERR8;
	3219	goto FAILED;
	3220	}
	3221
	3222	if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
	3223
	3224	/* Remember \r or \n */
	3225
	3226	if (d == '\r' \|\| d == '\n') cd->external_flags \|= PCRE_HASCRORLF;
	3227
	3228	/* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
	3229	matching, we have to use an XCLASS with extra data items. Caseless
	3230	matching for characters > 127 is available only if UCP support is
	3231	available. */
	3232
	3233	#ifdef SUPPORT_UTF8
	3234	if (utf8 && (d > 255 \|\| ((options & PCRE_CASELESS) != 0 && d > 127)))
	3235	{
	3236	class_utf8 = TRUE;
	3237
	3238	/* With UCP support, we can find the other case equivalents of
	3239	the relevant characters. There may be several ranges. Optimize how
	3240	they fit with the basic range. */
	3241
	3242	#ifdef SUPPORT_UCP
	3243	if ((options & PCRE_CASELESS) != 0)
	3244	{
	3245	unsigned int occ, ocd;
	3246	unsigned int cc = c;
	3247	unsigned int origd = d;
	3248	while (get_othercase_range(&cc, origd, &occ, &ocd))
	3249	{
	3250	if (occ >= (unsigned int)c &&
	3251	ocd <= (unsigned int)d)
	3252	continue; /* Skip embedded ranges */
	3253
	3254	if (occ < (unsigned int)c &&
	3255	ocd >= (unsigned int)c - 1) /* Extend the basic range */
	3256	{ /* if there is overlap, */
	3257	c = occ; /* noting that if occ < c */
	3258	continue; /* we can't have ocd > d */
	3259	} /* because a subrange is */
	3260	if (ocd > (unsigned int)d &&
	3261	occ <= (unsigned int)d + 1) /* always shorter than */
	3262	{ /* the basic range. */
	3263	d = ocd;
	3264	continue;
	3265	}
	3266
	3267	if (occ == ocd)
	3268	{
	3269	*class_utf8data++ = XCL_SINGLE;
	3270	}
	3271	else
	3272	{
	3273	*class_utf8data++ = XCL_RANGE;
	3274	class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
	3275	}
	3276	class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
	3277	}
	3278	}
	3279	#endif /* SUPPORT_UCP */
	3280
	3281	/* Now record the original range, possibly modified for UCP caseless
	3282	overlapping ranges. */
	3283
	3284	*class_utf8data++ = XCL_RANGE;
	3285	class_utf8data += _pcre_ord2utf8(c, class_utf8data);
	3286	class_utf8data += _pcre_ord2utf8(d, class_utf8data);
	3287
	3288	/* With UCP support, we are done. Without UCP support, there is no
	3289	caseless matching for UTF-8 characters > 127; we can use the bit map
	3290	for the smaller ones. */
	3291
	3292	#ifdef SUPPORT_UCP
	3293	continue; /* With next character in the class */
	3294	#else
	3295	if ((options & PCRE_CASELESS) == 0 \|\| c > 127) continue;
	3296
	3297	/* Adjust upper limit and fall through to set up the map */
	3298
	3299	d = 127;
	3300
	3301	#endif /* SUPPORT_UCP */
	3302	}
	3303	#endif /* SUPPORT_UTF8 */
	3304
	3305	/* We use the bit map for all cases when not in UTF-8 mode; else
	3306	ranges that lie entirely within 0-127 when there is UCP support; else
	3307	for partial ranges without UCP support. */
	3308
	3309	class_charcount += d - c + 1;
	3310	class_lastchar = d;
	3311
	3312	/* We can save a bit of time by skipping this in the pre-compile. */
	3313
	3314	if (lengthptr == NULL) for (; c <= d; c++)
	3315	{
	3316	classbits[c/8] \|= (1 << (c&7));
	3317	if ((options & PCRE_CASELESS) != 0)
	3318	{
	3319	int uc = cd->fcc[c]; /* flip case */
	3320	classbits[uc/8] \|= (1 << (uc&7));
	3321	}
	3322	}
	3323
	3324	continue; /* Go get the next char in the class */
	3325	}
	3326
	3327	/* Handle a lone single character - we can get here for a normal
	3328	non-escape char, or after \ that introduces a single character or for an
	3329	apparent range that isn't. */
	3330
	3331	LONE_SINGLE_CHARACTER:
	3332
	3333	/* Handle a character that cannot go in the bit map */
	3334
	3335	#ifdef SUPPORT_UTF8
	3336	if (utf8 && (c > 255 \|\| ((options & PCRE_CASELESS) != 0 && c > 127)))
	3337	{
	3338	class_utf8 = TRUE;
	3339	*class_utf8data++ = XCL_SINGLE;
	3340	class_utf8data += _pcre_ord2utf8(c, class_utf8data);
	3341
	3342	#ifdef SUPPORT_UCP
	3343	if ((options & PCRE_CASELESS) != 0)
	3344	{
	3345	unsigned int othercase;
	3346	if ((othercase = UCD_OTHERCASE(c)) != c)
	3347	{
	3348	*class_utf8data++ = XCL_SINGLE;
	3349	class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
	3350	}
	3351	}
	3352	#endif /* SUPPORT_UCP */
	3353
	3354	}
	3355	else
	3356	#endif /* SUPPORT_UTF8 */
	3357
	3358	/* Handle a single-byte character */
	3359	{
	3360	classbits[c/8] \|= (1 << (c&7));
	3361	if ((options & PCRE_CASELESS) != 0)
	3362	{
	3363	c = cd->fcc[c]; /* flip case */
	3364	classbits[c/8] \|= (1 << (c&7));
	3365	}
	3366	class_charcount++;
	3367	class_lastchar = c;
	3368	}
	3369	}
	3370
	3371	/* Loop until ']' reached. This "while" is the end of the "do" above. */
	3372
	3373	while ((c = *(++ptr)) != 0 && (c != ']' \|\| inescq));
	3374
	3375	if (c == 0) /* Missing terminating ']' */
	3376	{
	3377	*errorcodeptr = ERR6;
	3378	goto FAILED;
	3379	}
	3380
	3381
	3382	/* This code has been disabled because it would mean that \s counts as
	3383	an explicit \r or \n reference, and that's not really what is wanted. Now
	3384	we set the flag only if there is a literal "\r" or "\n" in the class. */
	3385
	3386	#if 0
	3387	/* Remember whether \r or \n are in this class */
	3388
	3389	if (negate_class)
	3390	{
	3391	if ((classbits[1] & 0x24) != 0x24) cd->external_flags \|= PCRE_HASCRORLF;
	3392	}
	3393	else
	3394	{
	3395	if ((classbits[1] & 0x24) != 0) cd->external_flags \|= PCRE_HASCRORLF;
	3396	}
	3397	#endif
	3398
	3399
	3400	/* If class_charcount is 1, we saw precisely one character whose value is
	3401	less than 256. As long as there were no characters >= 128 and there was no
	3402	use of \p or \P, in other words, no use of any XCLASS features, we can
	3403	optimize.
	3404
	3405	In UTF-8 mode, we can optimize the negative case only if there were no
	3406	characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
	3407	operate on single-bytes only. This is an historical hangover. Maybe one day
	3408	we can tidy these opcodes to handle multi-byte characters.
	3409
	3410	The optimization throws away the bit map. We turn the item into a
	3411	1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
	3412	that OP_NOT does not support multibyte characters. In the positive case, it
	3413	can cause firstbyte to be set. Otherwise, there can be no first char if
	3414	this item is first, whatever repeat count may follow. In the case of
	3415	reqbyte, save the previous value for reinstating. */
	3416
	3417	#ifdef SUPPORT_UTF8
	3418	if (class_charcount == 1 && !class_utf8 &&
	3419	(!utf8 \|\| !negate_class \|\| class_lastchar < 128))
	3420	#else
	3421	if (class_charcount == 1)
	3422	#endif
	3423	{
	3424	zeroreqbyte = reqbyte;
	3425
	3426	/* The OP_NOT opcode works on one-byte characters only. */
	3427
	3428	if (negate_class)
	3429	{
	3430	if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
	3431	zerofirstbyte = firstbyte;
	3432	*code++ = OP_NOT;
	3433	*code++ = class_lastchar;
	3434	break;
	3435	}
	3436
	3437	/* For a single, positive character, get the value into mcbuffer, and
	3438	then we can handle this with the normal one-character code. */
	3439
	3440	#ifdef SUPPORT_UTF8
	3441	if (utf8 && class_lastchar > 127)
	3442	mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
	3443	else
	3444	#endif
	3445	{
	3446	mcbuffer[0] = class_lastchar;
	3447	mclength = 1;
	3448	}
	3449	goto ONE_CHAR;
	3450	} /* End of 1-char optimization */
	3451
	3452	/* The general case - not the one-char optimization. If this is the first
	3453	thing in the branch, there can be no first char setting, whatever the
	3454	repeat count. Any reqbyte setting must remain unchanged after any kind of
	3455	repeat. */
	3456
	3457	if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
	3458	zerofirstbyte = firstbyte;
	3459	zeroreqbyte = reqbyte;
	3460
	3461	/* If there are characters with values > 255, we have to compile an
	3462	extended class, with its own opcode, unless there was a negated special
	3463	such as \S in the class, because in that case all characters > 255 are in
	3464	the class, so any that were explicitly given as well can be ignored. If
	3465	(when there are explicit characters > 255 that must be listed) there are no
	3466	characters < 256, we can omit the bitmap in the actual compiled code. */
	3467
	3468	#ifdef SUPPORT_UTF8
	3469	if (class_utf8 && !should_flip_negation)
	3470	{
	3471	*class_utf8data++ = XCL_END; /* Marks the end of extra data */
	3472	*code++ = OP_XCLASS;
	3473	code += LINK_SIZE;
	3474	*code = negate_class? XCL_NOT : 0;
	3475
	3476	/* If the map is required, move up the extra data to make room for it;
	3477	otherwise just move the code pointer to the end of the extra data. */
	3478
	3479	if (class_charcount > 0)
	3480	{
	3481	*code++ \|= XCL_MAP;
	3482	memmove(code + 32, code, class_utf8data - code);
	3483	memcpy(code, classbits, 32);
	3484	code = class_utf8data + 32;
	3485	}
	3486	else code = class_utf8data;
	3487
	3488	/* Now fill in the complete length of the item */
	3489
	3490	PUT(previous, 1, code - previous);
	3491	break; /* End of class handling */
	3492	}
	3493	#endif
	3494
	3495	/* If there are no characters > 255, set the opcode to OP_CLASS or
	3496	OP_NCLASS, depending on whether the whole class was negated and whether
	3497	there were negative specials such as \S in the class. Then copy the 32-byte
	3498	map into the code vector, negating it if necessary. */
	3499
	3500	*code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
	3501	if (negate_class)
	3502	{
	3503	if (lengthptr == NULL) /* Save time in the pre-compile phase */
	3504	for (c = 0; c < 32; c++) code[c] = ~classbits[c];
	3505	}
	3506	else
	3507	{
	3508	memcpy(code, classbits, 32);
	3509	}
	3510	code += 32;
	3511	break;
	3512
	3513
	3514	/* ===================================================================*/
	3515	/* Various kinds of repeat; '{' is not necessarily a quantifier, but this
	3516	has been tested above. */
	3517
	3518	case '{':
	3519	if (!is_quantifier) goto NORMAL_CHAR;
	3520	ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
	3521	if (*errorcodeptr != 0) goto FAILED;
	3522	goto REPEAT;
	3523
	3524	case '*':
	3525	repeat_min = 0;
	3526	repeat_max = -1;
	3527	goto REPEAT;
	3528
	3529	case '+':
	3530	repeat_min = 1;
	3531	repeat_max = -1;
	3532	goto REPEAT;
	3533
	3534	case '?':
	3535	repeat_min = 0;
	3536	repeat_max = 1;
	3537
	3538	REPEAT:
	3539	if (previous == NULL)
	3540	{
	3541	*errorcodeptr = ERR9;
	3542	goto FAILED;
	3543	}
	3544
	3545	if (repeat_min == 0)
	3546	{
	3547	firstbyte = zerofirstbyte; /* Adjust for zero repeat */
	3548	reqbyte = zeroreqbyte; /* Ditto */
	3549	}
	3550
	3551	/* Remember whether this is a variable length repeat */
	3552
	3553	reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
	3554
	3555	op_type = 0; /* Default single-char op codes */
	3556	possessive_quantifier = FALSE; /* Default not possessive quantifier */
	3557
	3558	/* Save start of previous item, in case we have to move it up to make space
	3559	for an inserted OP_ONCE for the additional '+' extension. */
	3560
	3561	tempcode = previous;
	3562
	3563	/* If the next character is '+', we have a possessive quantifier. This
	3564	implies greediness, whatever the setting of the PCRE_UNGREEDY option.
	3565	If the next character is '?' this is a minimizing repeat, by default,
	3566	but if PCRE_UNGREEDY is set, it works the other way round. We change the
	3567	repeat type to the non-default. */
	3568
	3569	if (ptr[1] == '+')
	3570	{
	3571	repeat_type = 0; /* Force greedy */
	3572	possessive_quantifier = TRUE;
	3573	ptr++;
	3574	}
	3575	else if (ptr[1] == '?')
	3576	{
	3577	repeat_type = greedy_non_default;
	3578	ptr++;
	3579	}
	3580	else repeat_type = greedy_default;
	3581
	3582	/* If previous was a character match, abolish the item and generate a
	3583	repeat item instead. If a char item has a minimum of more than one, ensure
	3584	that it is set in reqbyte - it might not be if a sequence such as x{3} is
	3585	the first thing in a branch because the x will have gone into firstbyte
	3586	instead. */
	3587
	3588	if (*previous == OP_CHAR || *previous == OP_CHARNC)
	3589	{
	3590	/* Deal with UTF-8 characters that take up more than one byte. It's
	3591	easier to write this out separately than try to macrify it. Use c to
	3592	hold the length of the character in bytes, plus 0x80 to flag that it's a
	3593	length rather than a small character. */
	3594
	3595	#ifdef SUPPORT_UTF8
	3596	if (utf8 && (code[-1] & 0x80) != 0)
	3597	{
	3598	uschar *lastchar = code - 1;
	3599	while((*lastchar & 0xc0) == 0x80) lastchar--;
	3600	c = code - lastchar; /* Length of UTF-8 character */
	3601	memcpy(utf8_char, lastchar, c); /* Save the char */
	3602	c \|= 0x80; /* Flag c as a length */
	3603	}
	3604	else
	3605	#endif
	3606
	3607	/* Handle the case of a single byte - either with no UTF8 support, or
	3608	with UTF-8 disabled, or for a UTF-8 character < 128. */
	3609
	3610	{
	3611	c = code[-1];
	3612	if (repeat_min > 1) reqbyte = c \| req_caseopt \| cd->req_varyopt;
	3613	}
	3614
	3615	/* If the repetition is unlimited, it pays to see if the next thing on
	3616	the line is something that cannot possibly match this character. If so,
	3617	automatically possessifying this item gains some performance in the case
	3618	where the match fails. */
	3619
	3620	if (!possessive_quantifier &&
	3621	repeat_max < 0 &&
	3622	check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
	3623	options, cd))
	3624	{
	3625	repeat_type = 0; /* Force greedy */
	3626	possessive_quantifier = TRUE;
	3627	}
	3628
	3629	goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
	3630	}
	3631
	3632	/* If previous was a single negated character ([^a] or similar), we use
	3633	one of the special opcodes, replacing it. The code is shared with single-
	3634	character repeats by setting op_type to add a suitable offset into
	3635	repeat_type. We can also test for auto-possessification. OP_NOT is
	3636	currently used only for single-byte chars. */
	3637
	3638	else if (*previous == OP_NOT)
	3639	{
	3640	op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
	3641	c = previous[1];
	3642	if (!possessive_quantifier &&
	3643	repeat_max < 0 &&
	3644	check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
	3645	{
	3646	repeat_type = 0; /* Force greedy */
	3647	possessive_quantifier = TRUE;
	3648	}
	3649	goto OUTPUT_SINGLE_REPEAT;
	3650	}
	3651
	3652	/* If previous was a character type match (\d or similar), abolish it and
	3653	create a suitable repeat item. The code is shared with single-character
	3654	repeats by setting op_type to add a suitable offset into repeat_type. Note
	3655	that the Unicode property types will be present only when SUPPORT_UCP is
	3656	defined, but we don't wrap the little bits of code here because it just
	3657	makes it horribly messy. */
	3658
	3659	else if (*previous < OP_EODN)
	3660	{
	3661	uschar *oldcode;
	3662	int prop_type, prop_value;
	3663	op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
	3664	c = *previous;
	3665
	3666	if (!possessive_quantifier &&
	3667	repeat_max < 0 &&
	3668	check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
	3669	{
	3670	repeat_type = 0; /* Force greedy */
	3671	possessive_quantifier = TRUE;
	3672	}
	3673
	3674	OUTPUT_SINGLE_REPEAT:
	3675	if (*previous == OP_PROP || *previous == OP_NOTPROP)
	3676	{
	3677	prop_type = previous[1];
	3678	prop_value = previous[2];
	3679	}
	3680	else prop_type = prop_value = -1;
	3681
	3682	oldcode = code;
	3683	code = previous; /* Usually overwrite previous item */
	3684
	3685	/* If the maximum is zero then the minimum must also be zero; Perl allows
	3686	this case, so we do too - by simply omitting the item altogether. */
	3687
	3688	if (repeat_max == 0) goto END_REPEAT;
	3689
	3690	/* All real repeats make it impossible to handle partial matching (maybe
	3691	one day we will be able to remove this restriction). */
	3692
	3693	if (repeat_max != 1) cd->external_flags \|= PCRE_NOPARTIAL;
	3694
	3695	/* Combine the op_type with the repeat_type */
	3696
	3697	repeat_type += op_type;
	3698
	3699	/* A minimum of zero is handled either as the special case * or ?, or as
	3700	an UPTO, with the maximum given. */
	3701
	3702	if (repeat_min == 0)
	3703	{
	3704	if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
	3705	else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
	3706	else
	3707	{
	3708	*code++ = OP_UPTO + repeat_type;
	3709	PUT2INC(code, 0, repeat_max);
	3710	}
	3711	}
	3712
	3713	/* A repeat minimum of 1 is optimized into some special cases. If the
	3714	maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
	3715	left in place and, if the maximum is greater than 1, we use OP_UPTO with
	3716	one less than the maximum. */
	3717
	3718	else if (repeat_min == 1)
	3719	{
	3720	if (repeat_max == -1)
	3721	*code++ = OP_PLUS + repeat_type;
	3722	else
	3723	{
	3724	code = oldcode; /* leave previous item in place */
	3725	if (repeat_max == 1) goto END_REPEAT;
	3726	*code++ = OP_UPTO + repeat_type;
	3727	PUT2INC(code, 0, repeat_max - 1);
	3728	}
	3729	}
	3730
	3731	/* The case {n,n} is just an EXACT, while the general case {n,m} is
	3732	handled as an EXACT followed by an UPTO. */
	3733
	3734	else
	3735	{
	3736	*code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
	3737	PUT2INC(code, 0, repeat_min);
	3738
	3739	/* If the maximum is unlimited, insert an OP_STAR. Before doing so,
	3740	we have to insert the character for the previous code. For a repeated
	3741	Unicode property match, there are two extra bytes that define the
	3742	required property. In UTF-8 mode, long characters have their length in
	3743	c, with the 0x80 bit as a flag. */
	3744
	3745	if (repeat_max < 0)
	3746	{
	3747	#ifdef SUPPORT_UTF8
	3748	if (utf8 && c >= 128)
	3749	{
	3750	memcpy(code, utf8_char, c & 7);
	3751	code += c & 7;
	3752	}
	3753	else
	3754	#endif
	3755	{
	3756	*code++ = c;
	3757	if (prop_type >= 0)
	3758	{
	3759	*code++ = prop_type;
	3760	*code++ = prop_value;
	3761	}
	3762	}
	3763	*code++ = OP_STAR + repeat_type;
	3764	}
	3765
	3766	/* Else insert an UPTO if the max is greater than the min, again
	3767	preceded by the character, for the previously inserted code. If the
	3768	UPTO is just for 1 instance, we can use QUERY instead. */
	3769
	3770	else if (repeat_max != repeat_min)
	3771	{
	3772	#ifdef SUPPORT_UTF8
	3773	if (utf8 && c >= 128)
	3774	{
	3775	memcpy(code, utf8_char, c & 7);
	3776	code += c & 7;
	3777	}
	3778	else
	3779	#endif
	3780	*code++ = c;
	3781	if (prop_type >= 0)
	3782	{
	3783	*code++ = prop_type;
	3784	*code++ = prop_value;
	3785	}
	3786	repeat_max -= repeat_min;
	3787
	3788	if (repeat_max == 1)
	3789	{
	3790	*code++ = OP_QUERY + repeat_type;
	3791	}
	3792	else
	3793	{
	3794	*code++ = OP_UPTO + repeat_type;
	3795	PUT2INC(code, 0, repeat_max);
	3796	}
	3797	}
	3798	}
	3799
	3800	/* The character or character type itself comes last in all cases. */
	3801
	3802	#ifdef SUPPORT_UTF8
	3803	if (utf8 && c >= 128)
	3804	{
	3805	memcpy(code, utf8_char, c & 7);
	3806	code += c & 7;
	3807	}
	3808	else
	3809	#endif
	3810	*code++ = c;
	3811
	3812	/* For a repeated Unicode property match, there are two extra bytes that
	3813	define the required property. */
	3814
	3815	#ifdef SUPPORT_UCP
	3816	if (prop_type >= 0)
	3817	{
	3818	*code++ = prop_type;
	3819	*code++ = prop_value;
	3820	}
	3821	#endif
	3822	}
	3823
	3824	/* If previous was a character class or a back reference, we put the repeat
	3825	stuff after it, but just skip the item if the repeat was {0,0}. */
	3826
	3827	else if (*previous == OP_CLASS \|\|
	3828	*previous == OP_NCLASS \|\|
	3829	#ifdef SUPPORT_UTF8
	3830	*previous == OP_XCLASS \|\|
	3831	#endif
	3832	*previous == OP_REF)
	3833	{
	3834	if (repeat_max == 0)
	3835	{
	3836	code = previous;
	3837	goto END_REPEAT;
	3838	}
	3839
	3840	/* All real repeats make it impossible to handle partial matching (maybe
	3841	one day we will be able to remove this restriction). */
	3842
	3843	if (repeat_max != 1) cd->external_flags \|= PCRE_NOPARTIAL;
	3844
	3845	if (repeat_min == 0 && repeat_max == -1)
	3846	*code++ = OP_CRSTAR + repeat_type;
	3847	else if (repeat_min == 1 && repeat_max == -1)
	3848	*code++ = OP_CRPLUS + repeat_type;
	3849	else if (repeat_min == 0 && repeat_max == 1)
	3850	*code++ = OP_CRQUERY + repeat_type;
	3851	else
	3852	{
	3853	*code++ = OP_CRRANGE + repeat_type;
	3854	PUT2INC(code, 0, repeat_min);
	3855	if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
	3856	PUT2INC(code, 0, repeat_max);
	3857	}
	3858	}
	3859
	3860	/* If previous was a bracket group, we may have to replicate it in certain
	3861	cases. */
	3862
	3863	else if (*previous == OP_BRA || *previous == OP_CBRA ||
	3864	*previous == OP_ONCE || *previous == OP_COND)
	3865	{
	3866	register int i;
	3867	int ketoffset = 0;
	3868	int len = code - previous;
	3869	uschar *bralink = NULL;
	3870
	3871	/* Repeating a DEFINE group is pointless */
	3872
	3873	if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
	3874	{
	3875	*errorcodeptr = ERR55;
	3876	goto FAILED;
	3877	}
	3878
	3879	/* If the maximum repeat count is unlimited, find the end of the bracket
	3880	by scanning through from the start, and compute the offset back to it
	3881	from the current code pointer. There may be an OP_OPT setting following
	3882	the final KET, so we can't find the end just by going back from the code
	3883	pointer. */
	3884
	3885	if (repeat_max == -1)
	3886	{
	3887	register uschar *ket = previous;
	3888	do ket += GET(ket, 1); while (*ket != OP_KET);
	3889	ketoffset = code - ket;
	3890	}
	3891
	3892	/* The case of a zero minimum is special because of the need to stick
	3893	OP_BRAZERO in front of it, and because the group appears once in the
	3894	data, whereas in other cases it appears the minimum number of times. For
	3895	this reason, it is simplest to treat this case separately, as otherwise
	3896	the code gets far too messy. There are several special subcases when the
	3897	minimum is zero. */
	3898
	3899	if (repeat_min == 0)
	3900	{
	3901	/* If the maximum is also zero, we used to just omit the group from the
	3902	output altogether, like this:
	3903
	3904	** if (repeat_max == 0)
	3905	** {
	3906	** code = previous;
	3907	** goto END_REPEAT;
	3908	** }
	3909
	3910	However, that fails when a group is referenced as a subroutine from
	3911	elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
	3912	so that it is skipped on execution. As we don't have a list of which
	3913	groups are referenced, we cannot do this selectively.
	3914
	3915	If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
	3916	and do no more at this point. However, we do need to adjust any
	3917	OP_RECURSE calls inside the group that refer to the group itself or any
	3918	internal or forward referenced group, because the offset is from the
	3919	start of the whole regex. Temporarily terminate the pattern while doing
	3920	this. */
	3921
	3922	if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
	3923	{
	3924	*code = OP_END;
	3925	adjust_recurse(previous, 1, utf8, cd, save_hwm);
	3926	memmove(previous+1, previous, len);
	3927	code++;
	3928	if (repeat_max == 0)
	3929	{
	3930	*previous++ = OP_SKIPZERO;
	3931	goto END_REPEAT;
	3932	}
	3933	*previous++ = OP_BRAZERO + repeat_type;
	3934	}
	3935
	3936	/* If the maximum is greater than 1 and limited, we have to replicate
	3937	in a nested fashion, sticking OP_BRAZERO before each set of brackets.
	3938	The first one has to be handled carefully because it's the original
	3939	copy, which has to be moved up. The remainder can be handled by code
	3940	that is common with the non-zero minimum case below. We have to
	3941	adjust the value of repeat_max, since one less copy is required. Once
	3942	again, we may have to adjust any OP_RECURSE calls inside the group. */
	3943
	3944	else
	3945	{
	3946	int offset;
	3947	*code = OP_END;
	3948	adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
	3949	memmove(previous + 2 + LINK_SIZE, previous, len);
	3950	code += 2 + LINK_SIZE;
	3951	*previous++ = OP_BRAZERO + repeat_type;
	3952	*previous++ = OP_BRA;
	3953
	3954	/* We chain together the bracket offset fields that have to be
	3955	filled in later when the ends of the brackets are reached. */
	3956
	3957	offset = (bralink == NULL)? 0 : previous - bralink;
	3958	bralink = previous;
	3959	PUTINC(previous, 0, offset);
	3960	}
	3961
	3962	repeat_max--;
	3963	}
	3964
	3965	/* If the minimum is greater than zero, replicate the group as many
	3966	times as necessary, and adjust the maximum to the number of subsequent
	3967	copies that we need. If we set a first char from the group, and didn't
	3968	set a required char, copy the latter from the former. If there are any
	3969	forward reference subroutine calls in the group, there will be entries on
	3970	the workspace list; replicate these with an appropriate increment. */
	3971
	3972	else
	3973	{
	3974	if (repeat_min > 1)
	3975	{
	3976	/* In the pre-compile phase, we don't actually do the replication. We
	3977	just adjust the length as if we had. Do some paranoid checks for
	3978	potential integer overflow. */
	3979
	3980	if (lengthptr != NULL)
	3981	{
	3982	int delta = (repeat_min - 1)*length_prevgroup;
	3983	if ((double)(repeat_min - 1)*(double)length_prevgroup >
	3984	(double)INT_MAX \|\|
	3985	OFLOW_MAX - *lengthptr < delta)
	3986	{
	3987	*errorcodeptr = ERR20;
	3988	goto FAILED;
	3989	}
	3990	*lengthptr += delta;
	3991	}
	3992
	3993	/* This is compiling for real */
	3994
	3995	else
	3996	{
	3997	if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
	3998	for (i = 1; i < repeat_min; i++)
	3999	{
	4000	uschar *hc;
	4001	uschar *this_hwm = cd->hwm;
	4002	memcpy(code, previous, len);
	4003	for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
	4004	{
	4005	PUT(cd->hwm, 0, GET(hc, 0) + len);
	4006	cd->hwm += LINK_SIZE;
	4007	}
	4008	save_hwm = this_hwm;
	4009	code += len;
	4010	}
	4011	}
	4012	}
	4013
	4014	if (repeat_max > 0) repeat_max -= repeat_min;
	4015	}
	4016
	4017	/* This code is common to both the zero and non-zero minimum cases. If
	4018	the maximum is limited, it replicates the group in a nested fashion,
	4019	remembering the bracket starts on a stack. In the case of a zero minimum,
	4020	the first one was set up above. In all cases the repeat_max now specifies
	4021	the number of additional copies needed. Again, we must remember to
	4022	replicate entries on the forward reference list. */
	4023
	4024	if (repeat_max >= 0)
	4025	{
	4026	/* In the pre-compile phase, we don't actually do the replication. We
	4027	just adjust the length as if we had. For each repetition we must add 1
	4028	to the length for BRAZERO and for all but the last repetition we must
	4029	add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
	4030	paranoid checks to avoid integer overflow. */
	4031
	4032	if (lengthptr != NULL && repeat_max > 0)
	4033	{
	4034	int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
	4035	2 - 2LINK_SIZE; / Last one doesn't nest */
	4036	if ((double)repeat_max *
	4037	(double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
	4038	> (double)INT_MAX \|\|
	4039	OFLOW_MAX - *lengthptr < delta)
	4040	{
	4041	*errorcodeptr = ERR20;
	4042	goto FAILED;
	4043	}
	4044	*lengthptr += delta;
	4045	}
	4046
	4047	/* This is compiling for real */
	4048
	4049	else for (i = repeat_max - 1; i >= 0; i--)
	4050	{
	4051	uschar *hc;
	4052	uschar *this_hwm = cd->hwm;
	4053
	4054	*code++ = OP_BRAZERO + repeat_type;
	4055
	4056	/* All but the final copy start a new nesting, maintaining the
	4057	chain of brackets outstanding. */
	4058
	4059	if (i != 0)
	4060	{
	4061	int offset;
	4062	*code++ = OP_BRA;
	4063	offset = (bralink == NULL)? 0 : code - bralink;
	4064	bralink = code;
	4065	PUTINC(code, 0, offset);
	4066	}
	4067
	4068	memcpy(code, previous, len);
	4069	for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
	4070	{
	4071	PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
	4072	cd->hwm += LINK_SIZE;
	4073	}
	4074	save_hwm = this_hwm;
	4075	code += len;
	4076	}
	4077
	4078	/* Now chain through the pending brackets, and fill in their length
	4079	fields (which are holding the chain links pro tem). */
	4080
	4081	while (bralink != NULL)
	4082	{
	4083	int oldlinkoffset;
	4084	int offset = code - bralink + 1;
	4085	uschar *bra = code - offset;
	4086	oldlinkoffset = GET(bra, 1);
	4087	bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
	4088	*code++ = OP_KET;
	4089	PUTINC(code, 0, offset);
	4090	PUT(bra, 1, offset);
	4091	}
	4092	}
	4093
	4094	/* If the maximum is unlimited, set a repeater in the final copy. We
	4095	can't just offset backwards from the current code point, because we
	4096	don't know if there's been an options resetting after the ket. The
	4097	correct offset was computed above.
	4098
	4099	Then, when we are doing the actual compile phase, check to see whether
	4100	this group is a non-atomic one that could match an empty string. If so,
	4101	convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
	4102	that runtime checking can be done. [This check is also applied to
	4103	atomic groups at runtime, but in a different way.] */
	4104
	4105	else
	4106	{
	4107	uschar *ketcode = code - ketoffset;
	4108	uschar *bracode = ketcode - GET(ketcode, 1);
	4109	*ketcode = OP_KETRMAX + repeat_type;
	4110	if (lengthptr == NULL && *bracode != OP_ONCE)
	4111	{
	4112	uschar *scode = bracode;
	4113	do
	4114	{
	4115	if (could_be_empty_branch(scode, ketcode, utf8))
	4116	{
	4117	*bracode += OP_SBRA - OP_BRA;
	4118	break;
	4119	}
	4120	scode += GET(scode, 1);
	4121	}
	4122	while (*scode == OP_ALT);
	4123	}
	4124	}
	4125	}
	4126
	4127	/* If previous is OP_FAIL, it was generated by an empty class [] in
	4128	JavaScript mode. The other ways in which OP_FAIL can be generated, that is
	4129	by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
	4130	error above. We can just ignore the repeat in JS case. */
	4131
	4132	else if (*previous == OP_FAIL) goto END_REPEAT;
	4133
	4134	/* Else there's some kind of shambles */
	4135
	4136	else
	4137	{
	4138	*errorcodeptr = ERR11;
	4139	goto FAILED;
	4140	}
	4141
	4142	/* If the character following a repeat is '+', or if certain optimization
	4143	tests above succeeded, possessive_quantifier is TRUE. For some of the
	4144	simpler opcodes, there is a special alternative opcode for this. For
	4145	anything else, we wrap the entire repeated item inside OP_ONCE brackets.
	4146	The '+' notation is just syntactic sugar, taken from Sun's Java package,
	4147	but the special opcodes can optimize it a bit. The repeated item starts at
	4148	tempcode, not at previous, which might be the first part of a string whose
	4149	(former) last char we repeated.
	4150
	4151	Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
	4152	an 'upto' may follow. We skip over an 'exact' item, and then test the
	4153	length of what remains before proceeding. */
	4154
	4155	if (possessive_quantifier)
	4156	{
	4157	int len;
	4158	if (tempcode == OP_EXACT \|\| tempcode == OP_TYPEEXACT \|\|
	4159	*tempcode == OP_NOTEXACT)
	4160	tempcode += _pcre_OP_lengths[*tempcode] +
	4161	((*tempcode == OP_TYPEEXACT &&
	4162	(tempcode[3] == OP_PROP \|\| tempcode[3] == OP_NOTPROP))? 2:0);
	4163	len = code - tempcode;
	4164	if (len > 0) switch (*tempcode)
	4165	{
	4166	case OP_STAR: *tempcode = OP_POSSTAR; break;
	4167	case OP_PLUS: *tempcode = OP_POSPLUS; break;
	4168	case OP_QUERY: *tempcode = OP_POSQUERY; break;
	4169	case OP_UPTO: *tempcode = OP_POSUPTO; break;
	4170
	4171	case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
	4172	case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
	4173	case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
	4174	case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
	4175
	4176	case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
	4177	case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
	4178	case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
	4179	case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
	4180
	4181	default:
	4182	memmove(tempcode + 1+LINK_SIZE, tempcode, len);
	4183	code += 1 + LINK_SIZE;
	4184	len += 1 + LINK_SIZE;
	4185	tempcode[0] = OP_ONCE;
	4186	*code++ = OP_KET;
	4187	PUTINC(code, 0, len);
	4188	PUT(tempcode, 1, len);
	4189	break;
	4190	}
	4191	}
	4192
	4193	/* In all cases we no longer have a previous item. We also set the
	4194	"follows varying string" flag for subsequently encountered reqbytes if
	4195	it isn't already set and we have just passed a varying length item. */
	4196
	4197	END_REPEAT:
	4198	previous = NULL;
	4199	cd->req_varyopt \|= reqvary;
	4200	break;
	4201
	4202
	4203	/* ===================================================================*/
	4204	/* Start of nested parenthesized sub-expression, or comment or lookahead or
	4205	lookbehind or option setting or condition or all the other extended
	4206	parenthesis forms. */
	4207
	4208	case '(':
	4209	newoptions = options;
	4210	skipbytes = 0;
	4211	bravalue = OP_CBRA;
	4212	save_hwm = cd->hwm;
	4213	reset_bracount = FALSE;
	4214
	4215	/* First deal with various "verbs" that can be introduced by '*'. */
	4216
	4217	if ((++ptr) == '' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
	4218	{
	4219	int i, namelen;
	4220	const char *vn = verbnames;
	4221	const uschar *name = ++ptr;
	4222	previous = NULL;
	4223	while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
	4224	if (*ptr == ':')
	4225	{
	4226	errorcodeptr = ERR59; / Not supported */
	4227	goto FAILED;
	4228	}
	4229	if (*ptr != ')')
	4230	{
	4231	*errorcodeptr = ERR60;
	4232	goto FAILED;
	4233	}
	4234	namelen = ptr - name;
	4235	for (i = 0; i < verbcount; i++)
	4236	{
	4237	if (namelen == verbs[i].len &&
	4238	strncmp((char *)name, vn, namelen) == 0)
	4239	{
	4240	*code = verbs[i].op;
	4241	if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
	4242	break;
	4243	}
	4244	vn += verbs[i].len + 1;
	4245	}
	4246	if (i < verbcount) continue;
	4247	*errorcodeptr = ERR60;
	4248	goto FAILED;
	4249	}
	4250
	4251	/* Deal with the extended parentheses; all are introduced by '?', and the
	4252	appearance of any of them means that this is not a capturing group. */
	4253
	4254	else if (*ptr == '?')
	4255	{
	4256	int i, set, unset, namelen;
	4257	int *optset;
	4258	const uschar *name;
	4259	uschar *slot;
	4260
	4261	switch (*(++ptr))
	4262	{
	4263	case '#': /* Comment; skip to ket */
	4264	ptr++;
	4265	while (ptr != 0 && ptr != ')') ptr++;
	4266	if (*ptr == 0)
	4267	{
	4268	*errorcodeptr = ERR18;
	4269	goto FAILED;
	4270	}
	4271	continue;
	4272
	4273
	4274	/* ------------------------------------------------------------ */
	4275	case '\|': /* Reset capture count for each branch */
	4276	reset_bracount = TRUE;
	4277	/* Fall through */
	4278
	4279	/* ------------------------------------------------------------ */
	4280	case ':': /* Non-capturing bracket */
	4281	bravalue = OP_BRA;
	4282	ptr++;
	4283	break;
	4284
	4285
	4286	/* ------------------------------------------------------------ */
	4287	case '(':
	4288	bravalue = OP_COND; /* Conditional group */
	4289
	4290	/* A condition can be an assertion, a number (referring to a numbered
	4291	group), a name (referring to a named group), or 'R', referring to
	4292	recursion. R<digits> and R&name are also permitted for recursion tests.
	4293
	4294	There are several syntaxes for testing a named group: (?(name)) is used
	4295	by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
	4296
	4297	There are two unfortunate ambiguities, caused by history. (a) 'R' can
	4298	be the recursive thing or the name 'R' (and similarly for 'R' followed
	4299	by digits), and (b) a number could be a name that consists of digits.
	4300	In both cases, we look for a name first; if not found, we try the other
	4301	cases. */
	4302
	4303	/* For conditions that are assertions, check the syntax, and then exit
	4304	the switch. This will take control down to where bracketed groups,
	4305	including assertions, are processed. */
	4306
	4307	if (ptr[1] == '?' && (ptr[2] == '=' \|\| ptr[2] == '!' \|\| ptr[2] == '<'))
	4308	break;
	4309
	4310	/* Most other conditions use OP_CREF (a couple change to OP_RREF
	4311	below), and all need to skip 3 bytes at the start of the group. */
	4312
	4313	code[1+LINK_SIZE] = OP_CREF;
	4314	skipbytes = 3;
	4315	refsign = -1;
	4316
	4317	/* Check for a test for recursion in a named group. */
	4318
	4319	if (ptr[1] == 'R' && ptr[2] == '&')
	4320	{
	4321	terminator = -1;
	4322	ptr += 2;
	4323	code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
	4324	}
	4325
	4326	/* Check for a test for a named group's having been set, using the Perl
	4327	syntax (?(<name>) or (?('name') */
	4328
	4329	else if (ptr[1] == '<')
	4330	{
	4331	terminator = '>';
	4332	ptr++;
	4333	}
	4334	else if (ptr[1] == '\'')
	4335	{
	4336	terminator = '\'';
	4337	ptr++;
	4338	}
	4339	else
	4340	{
	4341	terminator = 0;
	4342	if (ptr[1] == '-' \|\| ptr[1] == '+') refsign = *(++ptr);
	4343	}
	4344
	4345	/* We now expect to read a name; anything else is an error */
	4346
	4347	if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
	4348	{
	4349	ptr += 1; /* To get the right offset */
	4350	*errorcodeptr = ERR28;
	4351	goto FAILED;
	4352	}
	4353
	4354	/* Read the name, but also get it as a number if it's all digits */
	4355
	4356	recno = 0;
	4357	name = ++ptr;
	4358	while ((cd->ctypes[*ptr] & ctype_word) != 0)
	4359	{
	4360	if (recno >= 0)
	4361	recno = ((digitab[*ptr] & ctype_digit) != 0)?
	4362	recno * 10 + *ptr - '0' : -1;
	4363	ptr++;
	4364	}
	4365	namelen = ptr - name;
	4366
	4367	if ((terminator > 0 && ptr++ != terminator) \|\| ptr++ != ')')
	4368	{
	4369	ptr--; /* Error offset */
	4370	*errorcodeptr = ERR26;
	4371	goto FAILED;
	4372	}
	4373
	4374	/* Do no further checking in the pre-compile phase. */
	4375
	4376	if (lengthptr != NULL) break;
	4377
	4378	/* In the real compile we do the work of looking for the actual
	4379	reference. If the string started with "+" or "-" we require the rest to
	4380	be digits, in which case recno will be set. */
	4381
	4382	if (refsign > 0)
	4383	{
	4384	if (recno <= 0)
	4385	{
	4386	*errorcodeptr = ERR58;
	4387	goto FAILED;
	4388	}
	4389	recno = (refsign == '-')?
	4390	cd->bracount - recno + 1 : recno +cd->bracount;
	4391	if (recno <= 0 \|\| recno > cd->final_bracount)
	4392	{
	4393	*errorcodeptr = ERR15;
	4394	goto FAILED;
	4395	}
	4396	PUT2(code, 2+LINK_SIZE, recno);
	4397	break;
	4398	}
	4399
	4400	/* Otherwise (did not start with "+" or "-"), start by looking for the
	4401	name. */
	4402
	4403	slot = cd->name_table;
	4404	for (i = 0; i < cd->names_found; i++)
	4405	{
	4406	if (strncmp((char )name, (char )slot+2, namelen) == 0) break;
	4407	slot += cd->name_entry_size;
	4408	}
	4409
	4410	/* Found a previous named subpattern */
	4411
	4412	if (i < cd->names_found)
	4413	{
	4414	recno = GET2(slot, 0);
	4415	PUT2(code, 2+LINK_SIZE, recno);
	4416	}
	4417
	4418	/* Search the pattern for a forward reference */
	4419
	4420	else if ((i = find_parens(ptr, cd, name, namelen,
	4421	(options & PCRE_EXTENDED) != 0)) > 0)
	4422	{
	4423	PUT2(code, 2+LINK_SIZE, i);
	4424	}
	4425
	4426	/* If terminator == 0 it means that the name followed directly after
	4427	the opening parenthesis [e.g. (?(abc)...] and in this case there are
	4428	some further alternatives to try. For the cases where terminator != 0
	4429	[things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
	4430	now checked all the possibilities, so give an error. */
	4431
	4432	else if (terminator != 0)
	4433	{
	4434	*errorcodeptr = ERR15;
	4435	goto FAILED;
	4436	}
	4437
	4438	/* Check for (?(R) for recursion. Allow digits after R to specify a
	4439	specific group number. */
	4440
	4441	else if (*name == 'R')
	4442	{
	4443	recno = 0;
	4444	for (i = 1; i < namelen; i++)
	4445	{
	4446	if ((digitab[name[i]] & ctype_digit) == 0)
	4447	{
	4448	*errorcodeptr = ERR15;
	4449	goto FAILED;
	4450	}
	4451	recno = recno * 10 + name[i] - '0';
	4452	}
	4453	if (recno == 0) recno = RREF_ANY;
	4454	code[1+LINK_SIZE] = OP_RREF; /* Change test type */
	4455	PUT2(code, 2+LINK_SIZE, recno);
	4456	}
	4457
	4458	/* Similarly, check for the (?(DEFINE) "condition", which is always
	4459	false. */
	4460
	4461	else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
	4462	{
	4463	code[1+LINK_SIZE] = OP_DEF;
	4464	skipbytes = 1;
	4465	}
	4466
	4467	/* Check for the "name" actually being a subpattern number. We are
	4468	in the second pass here, so final_bracount is set. */
	4469
	4470	else if (recno > 0 && recno <= cd->final_bracount)
	4471	{
	4472	PUT2(code, 2+LINK_SIZE, recno);
	4473	}
	4474
	4475	/* Either an unidentified subpattern, or a reference to (?(0) */
	4476
	4477	else
	4478	{
	4479	*errorcodeptr = (recno == 0)? ERR35: ERR15;
	4480	goto FAILED;
	4481	}
	4482	break;
	4483
	4484
	4485	/* ------------------------------------------------------------ */
	4486	case '=': /* Positive lookahead */
	4487	bravalue = OP_ASSERT;
	4488	ptr++;
	4489	break;
	4490
	4491
	4492	/* ------------------------------------------------------------ */
	4493	case '!': /* Negative lookahead */
	4494	ptr++;
	4495	if (ptr == ')') / Optimize (?!) */
	4496	{
	4497	*code++ = OP_FAIL;
	4498	previous = NULL;
	4499	continue;
	4500	}
	4501	bravalue = OP_ASSERT_NOT;
	4502	break;
	4503
	4504
	4505	/* ------------------------------------------------------------ */
	4506	case '<': /* Lookbehind or named define */
	4507	switch (ptr[1])
	4508	{
	4509	case '=': /* Positive lookbehind */
	4510	bravalue = OP_ASSERTBACK;
	4511	ptr += 2;
	4512	break;
	4513
	4514	case '!': /* Negative lookbehind */
	4515	bravalue = OP_ASSERTBACK_NOT;
	4516	ptr += 2;
	4517	break;
	4518
	4519	default: /* Could be name define, else bad */
	4520	if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
	4521	ptr++; /* Correct offset for error */
	4522	*errorcodeptr = ERR24;
	4523	goto FAILED;
	4524	}
	4525	break;
	4526
	4527
	4528	/* ------------------------------------------------------------ */
	4529	case '>': /* One-time brackets */
	4530	bravalue = OP_ONCE;
	4531	ptr++;
	4532	break;
	4533
	4534
	4535	/* ------------------------------------------------------------ */
	4536	case 'C': /* Callout - may be followed by digits; */
	4537	previous_callout = code; /* Save for later completion */
	4538	after_manual_callout = 1; /* Skip one item before completing */
	4539	*code++ = OP_CALLOUT;
	4540	{
	4541	int n = 0;
	4542	while ((digitab[*(++ptr)] & ctype_digit) != 0)
	4543	n = n * 10 + *ptr - '0';
	4544	if (*ptr != ')')
	4545	{
	4546	*errorcodeptr = ERR39;
	4547	goto FAILED;
	4548	}
	4549	if (n > 255)
	4550	{
	4551	*errorcodeptr = ERR38;
	4552	goto FAILED;
	4553	}
	4554	*code++ = n;
	4555	PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
	4556	PUT(code, LINK_SIZE, 0); /* Default length */
	4557	code += 2 * LINK_SIZE;
	4558	}
	4559	previous = NULL;
	4560	continue;
	4561
	4562
	4563	/* ------------------------------------------------------------ */
	4564	case 'P': /* Python-style named subpattern handling */
	4565	if ((++ptr) == '=' \|\| ptr == '>') /* Reference or recursion */
	4566	{
	4567	is_recurse = *ptr == '>';
	4568	terminator = ')';
	4569	goto NAMED_REF_OR_RECURSE;
	4570	}
	4571	else if (ptr != '<') / Test for Python-style definition */
	4572	{
	4573	*errorcodeptr = ERR41;
	4574	goto FAILED;
	4575	}
	4576	/* Fall through to handle (?P< as (?< is handled */
	4577
	4578
	4579	/* ------------------------------------------------------------ */
	4580	DEFINE_NAME: /* Come here from (?< handling */
	4581	case '\'':
	4582	{
	4583	terminator = (*ptr == '<')? '>' : '\'';
	4584	name = ++ptr;
	4585
	4586	while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
	4587	namelen = ptr - name;
	4588
	4589	/* In the pre-compile phase, just do a syntax check. */
	4590
	4591	if (lengthptr != NULL)
	4592	{
	4593	if (*ptr != terminator)
	4594	{
	4595	*errorcodeptr = ERR42;
	4596	goto FAILED;
	4597	}
	4598	if (cd->names_found >= MAX_NAME_COUNT)
	4599	{
	4600	*errorcodeptr = ERR49;
	4601	goto FAILED;
	4602	}
	4603	if (namelen + 3 > cd->name_entry_size)
	4604	{
	4605	cd->name_entry_size = namelen + 3;
	4606	if (namelen > MAX_NAME_SIZE)
	4607	{
	4608	*errorcodeptr = ERR48;
	4609	goto FAILED;
	4610	}
	4611	}
	4612	}
	4613
	4614	/* In the real compile, create the entry in the table */
	4615
	4616	else
	4617	{
	4618	slot = cd->name_table;
	4619	for (i = 0; i < cd->names_found; i++)
	4620	{
	4621	int crc = memcmp(name, slot+2, namelen);
	4622	if (crc == 0)
	4623	{
	4624	if (slot[2+namelen] == 0)
	4625	{
	4626	if ((options & PCRE_DUPNAMES) == 0)
	4627	{
	4628	*errorcodeptr = ERR43;
	4629	goto FAILED;
	4630	}
	4631	}
	4632	else crc = -1; /* Current name is substring */
	4633	}
	4634	if (crc < 0)
	4635	{
	4636	memmove(slot + cd->name_entry_size, slot,
	4637	(cd->names_found - i) * cd->name_entry_size);
	4638	break;
	4639	}
	4640	slot += cd->name_entry_size;
	4641	}
	4642
	4643	PUT2(slot, 0, cd->bracount + 1);
	4644	memcpy(slot + 2, name, namelen);
	4645	slot[2+namelen] = 0;
	4646	}
	4647	}
	4648
	4649	/* In both cases, count the number of names we've encountered. */
	4650
	4651	ptr++; /* Move past > or ' */
	4652	cd->names_found++;
	4653	goto NUMBERED_GROUP;
	4654
	4655
	4656	/* ------------------------------------------------------------ */
	4657	case '&': /* Perl recursion/subroutine syntax */
	4658	terminator = ')';
	4659	is_recurse = TRUE;
	4660	/* Fall through */
	4661
	4662	/* We come here from the Python syntax above that handles both
	4663	references (?P=name) and recursion (?P>name), as well as falling
	4664	through from the Perl recursion syntax (?&name). We also come here from
	4665	the Perl \k<name> or \k'name' back reference syntax and the \k{name}
	4666	.NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
	4667
	4668	NAMED_REF_OR_RECURSE:
	4669	name = ++ptr;
	4670	while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
	4671	namelen = ptr - name;
	4672
	4673	/* In the pre-compile phase, do a syntax check and set a dummy
	4674	reference number. */
	4675
	4676	if (lengthptr != NULL)
	4677	{
	4678	if (namelen == 0)
	4679	{
	4680	*errorcodeptr = ERR62;
	4681	goto FAILED;
	4682	}
	4683	if (*ptr != terminator)
	4684	{
	4685	*errorcodeptr = ERR42;
	4686	goto FAILED;
	4687	}
	4688	if (namelen > MAX_NAME_SIZE)
	4689	{
	4690	*errorcodeptr = ERR48;
	4691	goto FAILED;
	4692	}
	4693	recno = 0;
	4694	}
	4695
	4696	/* In the real compile, seek the name in the table. We check the name
	4697	first, and then check that we have reached the end of the name in the
	4698	table. That way, if the name is longer than any in the table,
	4699	the comparison will fail without reading beyond the table entry. */
	4700
	4701	else
	4702	{
	4703	slot = cd->name_table;
	4704	for (i = 0; i < cd->names_found; i++)
	4705	{
	4706	if (strncmp((char )name, (char )slot+2, namelen) == 0 &&
	4707	slot[2+namelen] == 0)
	4708	break;
	4709	slot += cd->name_entry_size;
	4710	}
	4711
	4712	if (i < cd->names_found) /* Back reference */
	4713	{
	4714	recno = GET2(slot, 0);
	4715	}
	4716	else if ((recno = /* Forward back reference */
	4717	find_parens(ptr, cd, name, namelen,
	4718	(options & PCRE_EXTENDED) != 0)) <= 0)
	4719	{
	4720	*errorcodeptr = ERR15;
	4721	goto FAILED;
	4722	}
	4723	}
	4724
	4725	/* In both phases, we can now go to the code that handles numerical
	4726	recursion or backreferences. */
	4727
	4728	if (is_recurse) goto HANDLE_RECURSION;
	4729	else goto HANDLE_REFERENCE;
	4730
	4731
	4732	/* ------------------------------------------------------------ */
	4733	case 'R': /* Recursion */
	4734	ptr++; /* Same as (?0) */
	4735	/* Fall through */
	4736
	4737
	4738	/* ------------------------------------------------------------ */
	4739	case '-': case '+':
	4740	case '0': case '1': case '2': case '3': case '4': /* Recursion or */
	4741	case '5': case '6': case '7': case '8': case '9': /* subroutine */
	4742	{
	4743	const uschar *called;
	4744	terminator = ')';
	4745
	4746	/* Come here from the \g<...> and \g'...' code (Oniguruma
	4747	compatibility). However, the syntax has been checked to ensure that
	4748	the ... are a (signed) number, so that neither ERR63 nor ERR29 will
	4749	be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
	4750	ever be taken. */
	4751
	4752	HANDLE_NUMERICAL_RECURSION:
	4753
	4754	if ((refsign = *ptr) == '+')
	4755	{
	4756	ptr++;
	4757	if ((digitab[*ptr] & ctype_digit) == 0)
	4758	{
	4759	*errorcodeptr = ERR63;
	4760	goto FAILED;
	4761	}
	4762	}
	4763	else if (refsign == '-')
	4764	{
	4765	if ((digitab[ptr[1]] & ctype_digit) == 0)
	4766	goto OTHER_CHAR_AFTER_QUERY;
	4767	ptr++;
	4768	}
	4769
	4770	recno = 0;
	4771	while((digitab[*ptr] & ctype_digit) != 0)
	4772	recno = recno * 10 + *ptr++ - '0';
	4773
	4774	if (*ptr != terminator)
	4775	{
	4776	*errorcodeptr = ERR29;
	4777	goto FAILED;
	4778	}
	4779
	4780	if (refsign == '-')
	4781	{
	4782	if (recno == 0)
	4783	{
	4784	*errorcodeptr = ERR58;
	4785	goto FAILED;
	4786	}
	4787	recno = cd->bracount - recno + 1;
	4788	if (recno <= 0)
	4789	{
	4790	*errorcodeptr = ERR15;
	4791	goto FAILED;
	4792	}
	4793	}
	4794	else if (refsign == '+')
	4795	{
	4796	if (recno == 0)
	4797	{
	4798	*errorcodeptr = ERR58;
	4799	goto FAILED;
	4800	}
	4801	recno += cd->bracount;
	4802	}
	4803
	4804	/* Come here from code above that handles a named recursion */
	4805
	4806	HANDLE_RECURSION:
	4807
	4808	previous = code;
	4809	called = cd->start_code;
	4810
	4811	/* When we are actually compiling, find the bracket that is being
	4812	referenced. Temporarily end the regex in case it doesn't exist before
	4813	this point. If we end up with a forward reference, first check that
	4814	the bracket does occur later so we can give the error (and position)
	4815	now. Then remember this forward reference in the workspace so it can
	4816	be filled in at the end. */
	4817
	4818	if (lengthptr == NULL)
	4819	{
	4820	*code = OP_END;
	4821	if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
	4822
	4823	/* Forward reference */
	4824
	4825	if (called == NULL)
	4826	{
	4827	if (find_parens(ptr, cd, NULL, recno,
	4828	(options & PCRE_EXTENDED) != 0) < 0)
	4829	{
	4830	*errorcodeptr = ERR15;
	4831	goto FAILED;
	4832	}
	4833	called = cd->start_code + recno;
	4834	PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
	4835	}
	4836
	4837	/* If not a forward reference, and the subpattern is still open,
	4838	this is a recursive call. We check to see if this is a left
	4839	recursion that could loop for ever, and diagnose that case. */
	4840
	4841	else if (GET(called, 1) == 0 &&
	4842	could_be_empty(called, code, bcptr, utf8))
	4843	{
	4844	*errorcodeptr = ERR40;
	4845	goto FAILED;
	4846	}
	4847	}
	4848
	4849	/* Insert the recursion/subroutine item, automatically wrapped inside
	4850	"once" brackets. Set up a "previous group" length so that a
	4851	subsequent quantifier will work. */
	4852
	4853	*code = OP_ONCE;
	4854	PUT(code, 1, 2 + 2*LINK_SIZE);
	4855	code += 1 + LINK_SIZE;
	4856
	4857	*code = OP_RECURSE;
	4858	PUT(code, 1, called - cd->start_code);
	4859	code += 1 + LINK_SIZE;
	4860
	4861	*code = OP_KET;
	4862	PUT(code, 1, 2 + 2*LINK_SIZE);
	4863	code += 1 + LINK_SIZE;
	4864
	4865	length_prevgroup = 3 + 3*LINK_SIZE;
	4866	}
	4867
	4868	/* Can't determine a first byte now */
	4869
	4870	if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
	4871	continue;
	4872
	4873
	4874	/* ------------------------------------------------------------ */
	4875	default: /* Other characters: check option setting */
	4876	OTHER_CHAR_AFTER_QUERY:
	4877	set = unset = 0;
	4878	optset = &set;
	4879
	4880	while (ptr != ')' && ptr != ':')
	4881	{
	4882	switch (*ptr++)
	4883	{
	4884	case '-': optset = &unset; break;
	4885
	4886	case 'J': /* Record that it changed in the external options */
	4887	*optset \|= PCRE_DUPNAMES;
	4888	cd->external_flags \|= PCRE_JCHANGED;
	4889	break;
	4890
	4891	case 'i': *optset \|= PCRE_CASELESS; break;
	4892	case 'm': *optset \|= PCRE_MULTILINE; break;
	4893	case 's': *optset \|= PCRE_DOTALL; break;
	4894	case 'x': *optset \|= PCRE_EXTENDED; break;
	4895	case 'U': *optset \|= PCRE_UNGREEDY; break;
	4896	case 'X': *optset \|= PCRE_EXTRA; break;
	4897
	4898	default: *errorcodeptr = ERR12;
	4899	ptr--; /* Correct the offset */
	4900	goto FAILED;
	4901	}
	4902	}
	4903
	4904	/* Set up the changed option bits, but don't change anything yet. */
	4905
	4906	newoptions = (options \| set) & (~unset);
	4907
	4908	/* If the options ended with ')' this is not the start of a nested
	4909	group with option changes, so the options change at this level. If this
	4910	item is right at the start of the pattern, the options can be
	4911	abstracted and made external in the pre-compile phase, and ignored in
	4912	the compile phase. This can be helpful when matching -- for instance in
	4913	caseless checking of required bytes.
	4914
	4915	If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
	4916	definitely not at the start of the pattern because something has been
	4917	compiled. In the pre-compile phase, however, the code pointer can have
	4918	that value after the start, because it gets reset as code is discarded
	4919	during the pre-compile. However, this can happen only at top level - if
	4920	we are within parentheses, the starting BRA will still be present. At
	4921	any parenthesis level, the length value can be used to test if anything
	4922	has been compiled at that level. Thus, a test for both these conditions
	4923	is necessary to ensure we correctly detect the start of the pattern in
	4924	both phases.
	4925
	4926	If we are not at the pattern start, compile code to change the ims
	4927	options if this setting actually changes any of them, and reset the
	4928	greedy defaults and the case value for firstbyte and reqbyte. */
	4929
	4930	if (*ptr == ')')
	4931	{
	4932	if (code == cd->start_code + 1 + LINK_SIZE &&
	4933	(lengthptr == NULL \|\| lengthptr == 2 + 2LINK_SIZE))
	4934	{
	4935	cd->external_options = newoptions;
	4936	}
	4937	else
	4938	{
	4939	if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
	4940	{
	4941	*code++ = OP_OPT;
	4942	*code++ = newoptions & PCRE_IMS;
	4943	}
	4944	greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
	4945	greedy_non_default = greedy_default ^ 1;
	4946	req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
	4947	}
	4948
	4949	/* Change options at this level, and pass them back for use
	4950	in subsequent branches. When not at the start of the pattern, this
	4951	information is also necessary so that a resetting item can be
	4952	compiled at the end of a group (if we are in a group). */
	4953
	4954	*optionsptr = options = newoptions;
	4955	previous = NULL; /* This item can't be repeated */
	4956	continue; /* It is complete */
	4957	}
	4958
	4959	/* If the options ended with ':' we are heading into a nested group
	4960	with possible change of options. Such groups are non-capturing and are
	4961	not assertions of any kind. All we need to do is skip over the ':';
	4962	the newoptions value is handled below. */
	4963
	4964	bravalue = OP_BRA;
	4965	ptr++;
	4966	} /* End of switch for character following (? */
	4967	} /* End of (? handling */
	4968
	4969	/* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
	4970	all unadorned brackets become non-capturing and behave like (?:...)
	4971	brackets. */
	4972
	4973	else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
	4974	{
	4975	bravalue = OP_BRA;
	4976	}
	4977
	4978	/* Else we have a capturing group. */
	4979
	4980	else
	4981	{
	4982	NUMBERED_GROUP:
	4983	cd->bracount += 1;
	4984	PUT2(code, 1+LINK_SIZE, cd->bracount);
	4985	skipbytes = 2;
	4986	}
	4987
	4988	/* Process nested bracketed regex. Assertions may not be repeated, but
	4989	other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
	4990	non-register variable in order to be able to pass its address because some
	4991	compilers complain otherwise. Pass in a new setting for the ims options if
	4992	they have changed. */
	4993
	4994	previous = (bravalue >= OP_ONCE)? code : NULL;
	4995	*code = bravalue;
	4996	tempcode = code;
	4997	tempreqvary = cd->req_varyopt; /* Save value before bracket */
	4998	length_prevgroup = 0; /* Initialize for pre-compile phase */
	4999
	5000	if (!compile_regex(
	5001	newoptions, /* The complete new option state */
	5002	options & PCRE_IMS, /* The previous ims option state */
	5003	&tempcode, /* Where to put code (updated) */
	5004	&ptr, /* Input pointer (updated) */
	5005	errorcodeptr, /* Where to put an error message */
	5006	(bravalue == OP_ASSERTBACK \|\|
	5007	bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
	5008	reset_bracount, /* True if (?\| group */
	5009	skipbytes, /* Skip over bracket number */
	5010	&subfirstbyte, /* For possible first char */
	5011	&subreqbyte, /* For possible last char */
	5012	bcptr, /* Current branch chain */
	5013	cd, /* Tables block */
	5014	(lengthptr == NULL)? NULL : /* Actual compile phase */
	5015	&length_prevgroup /* Pre-compile phase */
	5016	))
	5017	goto FAILED;
	5018
	5019	/* At the end of compiling, code is still pointing to the start of the
	5020	group, while tempcode has been updated to point past the end of the group
	5021	and any option resetting that may follow it. The pattern pointer (ptr)
	5022	is on the bracket. */
	5023
	5024	/* If this is a conditional bracket, check that there are no more than
	5025	two branches in the group, or just one if it's a DEFINE group. We do this
	5026	in the real compile phase, not in the pre-pass, where the whole group may
	5027	not be available. */
	5028
	5029	if (bravalue == OP_COND && lengthptr == NULL)
	5030	{
	5031	uschar *tc = code;
	5032	int condcount = 0;
	5033
	5034	do {
	5035	condcount++;
	5036	tc += GET(tc,1);
	5037	}
	5038	while (*tc != OP_KET);
	5039
	5040	/* A DEFINE group is never obeyed inline (the "condition" is always
	5041	false). It must have only one branch. */
	5042
	5043	if (code[LINK_SIZE+1] == OP_DEF)
	5044	{
	5045	if (condcount > 1)
	5046	{
	5047	*errorcodeptr = ERR54;
	5048	goto FAILED;
	5049	}
	5050	bravalue = OP_DEF; /* Just a flag to suppress char handling below */
	5051	}
	5052
	5053	/* A "normal" conditional group. If there is just one branch, we must not
	5054	make use of its firstbyte or reqbyte, because this is equivalent to an
	5055	empty second branch. */
	5056
	5057	else
	5058	{
	5059	if (condcount > 2)
	5060	{
	5061	*errorcodeptr = ERR27;
	5062	goto FAILED;
	5063	}
	5064	if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
	5065	}
	5066	}
	5067
	5068	/* Error if hit end of pattern */
	5069
	5070	if (*ptr != ')')
	5071	{
	5072	*errorcodeptr = ERR14;
	5073	goto FAILED;
	5074	}
	5075
	5076	/* In the pre-compile phase, update the length by the length of the group,
	5077	less the brackets at either end. Then reduce the compiled code to just a
	5078	set of non-capturing brackets so that it doesn't use much memory if it is
	5079	duplicated by a quantifier.*/
	5080
	5081	if (lengthptr != NULL)
	5082	{
	5083	if (OFLOW_MAX - lengthptr < length_prevgroup - 2 - 2LINK_SIZE)
	5084	{
	5085	*errorcodeptr = ERR20;
	5086	goto FAILED;
	5087	}
	5088	lengthptr += length_prevgroup - 2 - 2LINK_SIZE;
	5089	*code++ = OP_BRA;
	5090	PUTINC(code, 0, 1 + LINK_SIZE);
	5091	*code++ = OP_KET;
	5092	PUTINC(code, 0, 1 + LINK_SIZE);
	5093	break; /* No need to waste time with special character handling */
	5094	}
	5095
	5096	/* Otherwise update the main code pointer to the end of the group. */
	5097
	5098	code = tempcode;
	5099
	5100	/* For a DEFINE group, required and first character settings are not
	5101	relevant. */
	5102
	5103	if (bravalue == OP_DEF) break;
	5104
	5105	/* Handle updating of the required and first characters for other types of
	5106	group. Update for normal brackets of all kinds, and conditions with two
	5107	branches (see code above). If the bracket is followed by a quantifier with
	5108	zero repeat, we have to back off. Hence the definition of zeroreqbyte and
	5109	zerofirstbyte outside the main loop so that they can be accessed for the
	5110	back off. */
	5111
	5112	zeroreqbyte = reqbyte;
	5113	zerofirstbyte = firstbyte;
	5114	groupsetfirstbyte = FALSE;
	5115
	5116	if (bravalue >= OP_ONCE)
	5117	{
	5118	/* If we have not yet set a firstbyte in this branch, take it from the
	5119	subpattern, remembering that it was set here so that a repeat of more
	5120	than one can replicate it as reqbyte if necessary. If the subpattern has
	5121	no firstbyte, set "none" for the whole branch. In both cases, a zero
	5122	repeat forces firstbyte to "none". */
	5123
	5124	if (firstbyte == REQ_UNSET)
	5125	{
	5126	if (subfirstbyte >= 0)
	5127	{
	5128	firstbyte = subfirstbyte;
	5129	groupsetfirstbyte = TRUE;
	5130	}
	5131	else firstbyte = REQ_NONE;
	5132	zerofirstbyte = REQ_NONE;
	5133	}
	5134
	5135	/* If firstbyte was previously set, convert the subpattern's firstbyte
	5136	into reqbyte if there wasn't one, using the vary flag that was in
	5137	existence beforehand. */
	5138
	5139	else if (subfirstbyte >= 0 && subreqbyte < 0)
	5140	subreqbyte = subfirstbyte \| tempreqvary;
	5141
	5142	/* If the subpattern set a required byte (or set a first byte that isn't
	5143	really the first byte - see above), set it. */
	5144
	5145	if (subreqbyte >= 0) reqbyte = subreqbyte;
	5146	}
	5147
	5148	/* For a forward assertion, we take the reqbyte, if set. This can be
	5149	helpful if the pattern that follows the assertion doesn't set a different
	5150	char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
	5151	for an assertion, however because it leads to incorrect effect for patterns
	5152	such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
	5153	of a firstbyte. This is overcome by a scan at the end if there's no
	5154	firstbyte, looking for an asserted first char. */
	5155
	5156	else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
	5157	break; /* End of processing '(' */
	5158
	5159
	5160	/* ===================================================================*/
	5161	/* Handle metasequences introduced by \. For ones like \d, the ESC_ values
	5162	are arranged to be the negation of the corresponding OP_values. For the
	5163	back references, the values are ESC_REF plus the reference number. Only
	5164	back references and those types that consume a character may be repeated.
	5165	We can test for values between ESC_b and ESC_Z for the latter; this may
	5166	have to change if any new ones are ever created. */
	5167
	5168	case '\\':
	5169	tempptr = ptr;
	5170	c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
	5171	if (*errorcodeptr != 0) goto FAILED;
	5172
	5173	if (c < 0)
	5174	{
	5175	if (-c == ESC_Q) /* Handle start of quoted string */
	5176	{
	5177	if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
	5178	else inescq = TRUE;
	5179	continue;
	5180	}
	5181
	5182	if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
	5183
	5184	/* For metasequences that actually match a character, we disable the
	5185	setting of a first character if it hasn't already been set. */
	5186
	5187	if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
	5188	firstbyte = REQ_NONE;
	5189
	5190	/* Set values to reset to if this is followed by a zero repeat. */
	5191
	5192	zerofirstbyte = firstbyte;
	5193	zeroreqbyte = reqbyte;
	5194
	5195	/* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
	5196	is a subroutine call by number (Oniguruma syntax). In fact, the value
	5197	-ESC_g is returned only for these cases. So we don't need to check for <
	5198	or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
	5199	-ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
	5200	that is a synonym for a named back reference). */
	5201
	5202	if (-c == ESC_g)
	5203	{
	5204	const uschar *p;
	5205	save_hwm = cd->hwm; /* Normally this is set when '(' is read */
	5206	terminator = (*(++ptr) == '<')? '>' : '\'';
	5207
	5208	/* These two statements stop the compiler for warning about possibly
	5209	unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
	5210	fact, because we actually check for a number below, the paths that
	5211	would actually be in error are never taken. */
	5212
	5213	skipbytes = 0;
	5214	reset_bracount = FALSE;
	5215
	5216	/* Test for a name */
	5217
	5218	if (ptr[1] != '+' && ptr[1] != '-')
	5219	{
	5220	BOOL isnumber = TRUE;
	5221	for (p = ptr + 1; p != 0 && p != terminator; p++)
	5222	{
	5223	if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
	5224	if ((cd->ctypes[*p] & ctype_word) == 0) break;
	5225	}
	5226	if (*p != terminator)
	5227	{
	5228	*errorcodeptr = ERR57;
	5229	break;
	5230	}
	5231	if (isnumber)
	5232	{
	5233	ptr++;
	5234	goto HANDLE_NUMERICAL_RECURSION;
	5235	}
	5236	is_recurse = TRUE;
	5237	goto NAMED_REF_OR_RECURSE;
	5238	}
	5239
	5240	/* Test a signed number in angle brackets or quotes. */
	5241
	5242	p = ptr + 2;
	5243	while ((digitab[*p] & ctype_digit) != 0) p++;
	5244	if (*p != terminator)
	5245	{
	5246	*errorcodeptr = ERR57;
	5247	break;
	5248	}
	5249	ptr++;
	5250	goto HANDLE_NUMERICAL_RECURSION;
	5251	}
	5252
	5253	/* \k<name> or \k'name' is a back reference by name (Perl syntax).
	5254	We also support \k{name} (.NET syntax) */
	5255
	5256	if (-c == ESC_k && (ptr[1] == '<' \|\| ptr[1] == '\'' \|\| ptr[1] == '{'))
	5257	{
	5258	is_recurse = FALSE;
	5259	terminator = ((++ptr) == '<')? '>' : (ptr == '\'')? '\'' : '}';
	5260	goto NAMED_REF_OR_RECURSE;
	5261	}
	5262
	5263	/* Back references are handled specially; must disable firstbyte if
	5264	not set to cope with cases like (?=(\w+))\1: which would otherwise set
	5265	':' later. */
	5266
	5267	if (-c >= ESC_REF)
	5268	{
	5269	recno = -c - ESC_REF;
	5270
	5271	HANDLE_REFERENCE: /* Come here from named backref handling */
	5272	if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
	5273	previous = code;
	5274	*code++ = OP_REF;
	5275	PUT2INC(code, 0, recno);
	5276	cd->backref_map \|= (recno < 32)? (1 << recno) : 1;
	5277	if (recno > cd->top_backref) cd->top_backref = recno;
	5278	}
	5279
	5280	/* So are Unicode property matches, if supported. */
	5281
	5282	#ifdef SUPPORT_UCP
	5283	else if (-c == ESC_P \|\| -c == ESC_p)
	5284	{
	5285	BOOL negated;
	5286	int pdata;
	5287	int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
	5288	if (ptype < 0) goto FAILED;
	5289	previous = code;
	5290	*code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
	5291	*code++ = ptype;
	5292	*code++ = pdata;
	5293	}
	5294	#else
	5295
	5296	/* If Unicode properties are not supported, \X, \P, and \p are not
	5297	allowed. */
	5298
	5299	else if (-c == ESC_X \|\| -c == ESC_P \|\| -c == ESC_p)
	5300	{
	5301	*errorcodeptr = ERR45;
	5302	goto FAILED;
	5303	}
	5304	#endif
	5305
	5306	/* For the rest (including \X when Unicode properties are supported), we
	5307	can obtain the OP value by negating the escape value. */
	5308
	5309	else
	5310	{
	5311	previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
	5312	*code++ = -c;
	5313	}
	5314	continue;
	5315	}
	5316
	5317	/* We have a data character whose value is in c. In UTF-8 mode it may have
	5318	a value > 127. We set its representation in the length/buffer, and then
	5319	handle it as a data character. */
	5320
	5321	#ifdef SUPPORT_UTF8
	5322	if (utf8 && c > 127)
	5323	mclength = _pcre_ord2utf8(c, mcbuffer);
	5324	else
	5325	#endif
	5326
	5327	{
	5328	mcbuffer[0] = c;
	5329	mclength = 1;
	5330	}
	5331	goto ONE_CHAR;
	5332
	5333
	5334	/* ===================================================================*/
	5335	/* Handle a literal character. It is guaranteed not to be whitespace or #
	5336	when the extended flag is set. If we are in UTF-8 mode, it may be a
	5337	multi-byte literal character. */
	5338
	5339	default:
	5340	NORMAL_CHAR:
	5341	mclength = 1;
	5342	mcbuffer[0] = c;
	5343
	5344	#ifdef SUPPORT_UTF8
	5345	if (utf8 && c >= 0xc0)
	5346	{
	5347	while ((ptr[1] & 0xc0) == 0x80)
	5348	mcbuffer[mclength++] = *(++ptr);
	5349	}
	5350	#endif
	5351
	5352	/* At this point we have the character's bytes in mcbuffer, and the length
	5353	in mclength. When not in UTF-8 mode, the length is always 1. */
	5354
	5355	ONE_CHAR:
	5356	previous = code;
	5357	*code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
	5358	for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
	5359
	5360	/* Remember if \r or \n were seen */
	5361
	5362	if (mcbuffer[0] == '\r' \|\| mcbuffer[0] == '\n')
	5363	cd->external_flags \|= PCRE_HASCRORLF;
	5364
	5365	/* Set the first and required bytes appropriately. If no previous first
	5366	byte, set it from this character, but revert to none on a zero repeat.
	5367	Otherwise, leave the firstbyte value alone, and don't change it on a zero
	5368	repeat. */
	5369
	5370	if (firstbyte == REQ_UNSET)
	5371	{
	5372	zerofirstbyte = REQ_NONE;
	5373	zeroreqbyte = reqbyte;
	5374
	5375	/* If the character is more than one byte long, we can set firstbyte
	5376	only if it is not to be matched caselessly. */
	5377
	5378	if (mclength == 1 \|\| req_caseopt == 0)
	5379	{
	5380	firstbyte = mcbuffer[0] \| req_caseopt;
	5381	if (mclength != 1) reqbyte = code[-1] \| cd->req_varyopt;
	5382	}
	5383	else firstbyte = reqbyte = REQ_NONE;
	5384	}
	5385
	5386	/* firstbyte was previously set; we can set reqbyte only the length is
	5387	1 or the matching is caseful. */
	5388
	5389	else
	5390	{
	5391	zerofirstbyte = firstbyte;
	5392	zeroreqbyte = reqbyte;
	5393	if (mclength == 1 \|\| req_caseopt == 0)
	5394	reqbyte = code[-1] \| req_caseopt \| cd->req_varyopt;
	5395	}
	5396
	5397	break; /* End of literal character handling */
	5398	}
	5399	} /* end of big loop */
	5400
	5401
	5402	/* Control never reaches here by falling through, only by a goto for all the
	5403	error states. Pass back the position in the pattern so that it can be displayed
	5404	to the user for diagnosing the error. */
	5405
	5406	FAILED:
	5407	*ptrptr = ptr;
	5408	return FALSE;
	5409	}
	5410
	5411
	5412
	5413
	5414	/*************************************************
	5415	* Compile sequence of alternatives *
	5416	*************************************************/
	5417
	5418	/* On entry, ptr is pointing past the bracket character, but on return it
	5419	points to the closing bracket, or vertical bar, or end of string. The code
	5420	variable is pointing at the byte into which the BRA operator has been stored.
	5421	If the ims options are changed at the start (for a (?ims: group) or during any
	5422	branch, we need to insert an OP_OPT item at the start of every following branch
	5423	to ensure they get set correctly at run time, and also pass the new options
	5424	into every subsequent branch compile.
	5425
	5426	This function is used during the pre-compile phase when we are trying to find
	5427	out the amount of memory needed, as well as during the real compile phase. The
	5428	value of lengthptr distinguishes the two phases.
	5429
	5430	Arguments:
	5431	options option bits, including any changes for this subpattern
	5432	oldims previous settings of ims option bits
	5433	codeptr -> the address of the current code pointer
	5434	ptrptr -> the address of the current pattern pointer
	5435	errorcodeptr -> pointer to error code variable
	5436	lookbehind TRUE if this is a lookbehind assertion
	5437	reset_bracount TRUE to reset the count for each branch
	5438	skipbytes skip this many bytes at start (for brackets and OP_COND)
	5439	firstbyteptr place to put the first required character, or a negative number
	5440	reqbyteptr place to put the last required character, or a negative number
	5441	bcptr pointer to the chain of currently open branches
	5442	cd points to the data block with tables pointers etc.
	5443	lengthptr NULL during the real compile phase
	5444	points to length accumulator during pre-compile phase
	5445
	5446	Returns: TRUE on success
	5447	*/
	5448
	5449	static BOOL
	5450	compile_regex(int options, int oldims, uschar codeptr, const uschar ptrptr,
	5451	int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
	5452	int firstbyteptr, int reqbyteptr, branch_chain bcptr, compile_data cd,
	5453	int *lengthptr)
	5454	{
	5455	const uschar ptr = ptrptr;
	5456	uschar code = codeptr;
	5457	uschar *last_branch = code;
	5458	uschar *start_bracket = code;
	5459	uschar *reverse_count = NULL;
	5460	int firstbyte, reqbyte;
	5461	int branchfirstbyte, branchreqbyte;
	5462	int length;
	5463	int orig_bracount;
	5464	int max_bracount;
	5465	branch_chain bc;
	5466
	5467	bc.outer = bcptr;
	5468	bc.current = code;
	5469
	5470	firstbyte = reqbyte = REQ_UNSET;
	5471
	5472	/* Accumulate the length for use in the pre-compile phase. Start with the
	5473	length of the BRA and KET and any extra bytes that are required at the
	5474	beginning. We accumulate in a local variable to save frequent testing of
	5475	lenthptr for NULL. We cannot do this by looking at the value of code at the
	5476	start and end of each alternative, because compiled items are discarded during
	5477	the pre-compile phase so that the work space is not exceeded. */
	5478
	5479	length = 2 + 2*LINK_SIZE + skipbytes;
	5480
	5481	/* WARNING: If the above line is changed for any reason, you must also change
	5482	the code that abstracts option settings at the start of the pattern and makes
	5483	them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
	5484	pre-compile phase to find out whether anything has yet been compiled or not. */
	5485
	5486	/* Offset is set zero to mark that this bracket is still open */
	5487
	5488	PUT(code, 1, 0);
	5489	code += 1 + LINK_SIZE + skipbytes;
	5490
	5491	/* Loop for each alternative branch */
	5492
	5493	orig_bracount = max_bracount = cd->bracount;
	5494	for (;;)
	5495	{
	5496	/* For a (?\| group, reset the capturing bracket count so that each branch
	5497	uses the same numbers. */
	5498
	5499	if (reset_bracount) cd->bracount = orig_bracount;
	5500
	5501	/* Handle a change of ims options at the start of the branch */
	5502
	5503	if ((options & PCRE_IMS) != oldims)
	5504	{
	5505	*code++ = OP_OPT;
	5506	*code++ = options & PCRE_IMS;
	5507	length += 2;
	5508	}
	5509
	5510	/* Set up dummy OP_REVERSE if lookbehind assertion */
	5511
	5512	if (lookbehind)
	5513	{
	5514	*code++ = OP_REVERSE;
	5515	reverse_count = code;
	5516	PUTINC(code, 0, 0);
	5517	length += 1 + LINK_SIZE;
	5518	}
	5519
	5520	/* Now compile the branch; in the pre-compile phase its length gets added
	5521	into the length. */
	5522
	5523	if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
	5524	&branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
	5525	{
	5526	*ptrptr = ptr;
	5527	return FALSE;
	5528	}
	5529
	5530	/* Keep the highest bracket count in case (?\| was used and some branch
	5531	has fewer than the rest. */
	5532
	5533	if (cd->bracount > max_bracount) max_bracount = cd->bracount;
	5534
	5535	/* In the real compile phase, there is some post-processing to be done. */
	5536
	5537	if (lengthptr == NULL)
	5538	{
	5539	/* If this is the first branch, the firstbyte and reqbyte values for the
	5540	branch become the values for the regex. */
	5541
	5542	if (*last_branch != OP_ALT)
	5543	{
	5544	firstbyte = branchfirstbyte;
	5545	reqbyte = branchreqbyte;
	5546	}
	5547
	5548	/* If this is not the first branch, the first char and reqbyte have to
	5549	match the values from all the previous branches, except that if the
	5550	previous value for reqbyte didn't have REQ_VARY set, it can still match,
	5551	and we set REQ_VARY for the regex. */
	5552
	5553	else
	5554	{
	5555	/* If we previously had a firstbyte, but it doesn't match the new branch,
	5556	we have to abandon the firstbyte for the regex, but if there was
	5557	previously no reqbyte, it takes on the value of the old firstbyte. */
	5558
	5559	if (firstbyte >= 0 && firstbyte != branchfirstbyte)
	5560	{
	5561	if (reqbyte < 0) reqbyte = firstbyte;
	5562	firstbyte = REQ_NONE;
	5563	}
	5564
	5565	/* If we (now or from before) have no firstbyte, a firstbyte from the
	5566	branch becomes a reqbyte if there isn't a branch reqbyte. */
	5567
	5568	if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
	5569	branchreqbyte = branchfirstbyte;
	5570
	5571	/* Now ensure that the reqbytes match */
	5572
	5573	if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
	5574	reqbyte = REQ_NONE;
	5575	else reqbyte \|= branchreqbyte; /* To "or" REQ_VARY */
	5576	}
	5577
	5578	/* If lookbehind, check that this branch matches a fixed-length string, and
	5579	put the length into the OP_REVERSE item. Temporarily mark the end of the
	5580	branch with OP_END. */
	5581
	5582	if (lookbehind)
	5583	{
	5584	int fixed_length;
	5585	*code = OP_END;
	5586	fixed_length = find_fixedlength(last_branch, options);
	5587	DPRINTF(("fixed length = %d\n", fixed_length));
	5588	if (fixed_length < 0)
	5589	{
	5590	*errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
	5591	*ptrptr = ptr;
	5592	return FALSE;
	5593	}
	5594	PUT(reverse_count, 0, fixed_length);
	5595	}
	5596	}
	5597
	5598	/* Reached end of expression, either ')' or end of pattern. In the real
	5599	compile phase, go back through the alternative branches and reverse the chain
	5600	of offsets, with the field in the BRA item now becoming an offset to the
	5601	first alternative. If there are no alternatives, it points to the end of the
	5602	group. The length in the terminating ket is always the length of the whole
	5603	bracketed item. If any of the ims options were changed inside the group,
	5604	compile a resetting op-code following, except at the very end of the pattern.
	5605	Return leaving the pointer at the terminating char. */
	5606
	5607	if (*ptr != '\|')
	5608	{
	5609	if (lengthptr == NULL)
	5610	{
	5611	int branch_length = code - last_branch;
	5612	do
	5613	{
	5614	int prev_length = GET(last_branch, 1);
	5615	PUT(last_branch, 1, branch_length);
	5616	branch_length = prev_length;
	5617	last_branch -= branch_length;
	5618	}
	5619	while (branch_length > 0);
	5620	}
	5621
	5622	/* Fill in the ket */
	5623
	5624	*code = OP_KET;
	5625	PUT(code, 1, code - start_bracket);
	5626	code += 1 + LINK_SIZE;
	5627
	5628	/* Resetting option if needed */
	5629
	5630	if ((options & PCRE_IMS) != oldims && *ptr == ')')
	5631	{
	5632	*code++ = OP_OPT;
	5633	*code++ = oldims;
	5634	length += 2;
	5635	}
	5636
	5637	/* Retain the highest bracket number, in case resetting was used. */
	5638
	5639	cd->bracount = max_bracount;
	5640
	5641	/* Set values to pass back */
	5642
	5643	*codeptr = code;
	5644	*ptrptr = ptr;
	5645	*firstbyteptr = firstbyte;
	5646	*reqbyteptr = reqbyte;
	5647	if (lengthptr != NULL)
	5648	{
	5649	if (OFLOW_MAX - *lengthptr < length)
	5650	{
	5651	*errorcodeptr = ERR20;
	5652	return FALSE;
	5653	}
	5654	*lengthptr += length;
	5655	}
	5656	return TRUE;
	5657	}
	5658
	5659	/* Another branch follows. In the pre-compile phase, we can move the code
	5660	pointer back to where it was for the start of the first branch. (That is,
	5661	pretend that each branch is the only one.)
	5662
	5663	In the real compile phase, insert an ALT node. Its length field points back
	5664	to the previous branch while the bracket remains open. At the end the chain
	5665	is reversed. It's done like this so that the start of the bracket has a
	5666	zero offset until it is closed, making it possible to detect recursion. */
	5667
	5668	if (lengthptr != NULL)
	5669	{
	5670	code = *codeptr + 1 + LINK_SIZE + skipbytes;
	5671	length += 1 + LINK_SIZE;
	5672	}
	5673	else
	5674	{
	5675	*code = OP_ALT;
	5676	PUT(code, 1, code - last_branch);
	5677	bc.current = last_branch = code;
	5678	code += 1 + LINK_SIZE;
	5679	}
	5680
	5681	ptr++;
	5682	}
	5683	/* Control never reaches here */
	5684	}
	5685
	5686
	5687
	5688
	5689	/*************************************************
	5690	* Check for anchored expression *
	5691	*************************************************/
	5692
	5693	/* Try to find out if this is an anchored regular expression. Consider each
	5694	alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
	5695	all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
	5696	it's anchored. However, if this is a multiline pattern, then only OP_SOD
	5697	counts, since OP_CIRC can match in the middle.
	5698
	5699	We can also consider a regex to be anchored if OP_SOM starts all its branches.
	5700	This is the code for \G, which means "match at start of match position, taking
	5701	into account the match offset".
	5702
	5703	A branch is also implicitly anchored if it starts with .* and DOTALL is set,
	5704	because that will try the rest of the pattern at all possible matching points,
	5705	so there is no point trying again.... er ....
	5706
	5707	.... except when the .* appears inside capturing parentheses, and there is a
	5708	subsequent back reference to those parentheses. We haven't enough information
	5709	to catch that case precisely.
	5710
	5711	At first, the best we could do was to detect when .* was in capturing brackets
	5712	and the highest back reference was greater than or equal to that level.
	5713	However, by keeping a bitmap of the first 31 back references, we can catch some
	5714	of the more common cases more precisely.
	5715
	5716	Arguments:
	5717	code points to start of expression (the bracket)
	5718	options points to the options setting
	5719	bracket_map a bitmap of which brackets we are inside while testing; this
	5720	handles up to substring 31; after that we just have to take
	5721	the less precise approach
	5722	backref_map the back reference bitmap
	5723
	5724	Returns: TRUE or FALSE
	5725	*/
	5726
	5727	static BOOL
	5728	is_anchored(register const uschar code, int options, unsigned int bracket_map,
	5729	unsigned int backref_map)
	5730	{
	5731	do {
	5732	const uschar scode = first_significant_code(code + _pcre_OP_lengths[code],
	5733	options, PCRE_MULTILINE, FALSE);
	5734	register int op = *scode;
	5735
	5736	/* Non-capturing brackets */
	5737
	5738	if (op == OP_BRA)
	5739	{
	5740	if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
	5741	}
	5742
	5743	/* Capturing brackets */
	5744
	5745	else if (op == OP_CBRA)
	5746	{
	5747	int n = GET2(scode, 1+LINK_SIZE);
	5748	int new_map = bracket_map \| ((n < 32)? (1 << n) : 1);
	5749	if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
	5750	}
	5751
	5752	/* Other brackets */
	5753
	5754	else if (op == OP_ASSERT \|\| op == OP_ONCE \|\| op == OP_COND)
	5755	{
	5756	if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
	5757	}
	5758
	5759	/* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
	5760	it isn't in brackets that are or may be referenced. */
	5761
	5762	else if ((op == OP_TYPESTAR \|\| op == OP_TYPEMINSTAR \|\|
	5763	op == OP_TYPEPOSSTAR))
	5764	{
	5765	if (scode[1] != OP_ALLANY \|\| (bracket_map & backref_map) != 0)
	5766	return FALSE;
	5767	}
	5768
	5769	/* Check for explicit anchoring */
	5770
	5771	else if (op != OP_SOD && op != OP_SOM &&
	5772	((*options & PCRE_MULTILINE) != 0 \|\| op != OP_CIRC))
	5773	return FALSE;
	5774	code += GET(code, 1);
	5775	}
	5776	while (code == OP_ALT); / Loop for each alternative */
	5777	return TRUE;
	5778	}
	5779
	5780
	5781
	5782	/*************************************************
	5783	* Check for starting with ^ or .* *
	5784	*************************************************/
	5785
	5786	/* This is called to find out if every branch starts with ^ or .* so that
	5787	"first char" processing can be done to speed things up in multiline
	5788	matching and for non-DOTALL patterns that start with .* (which must start at
	5789	the beginning or after \n). As in the case of is_anchored() (see above), we
	5790	have to take account of back references to capturing brackets that contain .*
	5791	because in that case we can't make the assumption.
	5792
	5793	Arguments:
	5794	code points to start of expression (the bracket)
	5795	bracket_map a bitmap of which brackets we are inside while testing; this
	5796	handles up to substring 31; after that we just have to take
	5797	the less precise approach
	5798	backref_map the back reference bitmap
	5799
	5800	Returns: TRUE or FALSE
	5801	*/
	5802
	5803	static BOOL
	5804	is_startline(const uschar *code, unsigned int bracket_map,
	5805	unsigned int backref_map)
	5806	{
	5807	do {
	5808	const uschar scode = first_significant_code(code + _pcre_OP_lengths[code],
	5809	NULL, 0, FALSE);
	5810	register int op = *scode;
	5811
	5812	/* Non-capturing brackets */
	5813
	5814	if (op == OP_BRA)
	5815	{
	5816	if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
	5817	}
	5818
	5819	/* Capturing brackets */
	5820
	5821	else if (op == OP_CBRA)
	5822	{
	5823	int n = GET2(scode, 1+LINK_SIZE);
	5824	int new_map = bracket_map \| ((n < 32)? (1 << n) : 1);
	5825	if (!is_startline(scode, new_map, backref_map)) return FALSE;
	5826	}
	5827
	5828	/* Other brackets */
	5829
	5830	else if (op == OP_ASSERT \|\| op == OP_ONCE \|\| op == OP_COND)
	5831	{ if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
	5832
	5833	/* .* means "start at start or after \n" if it isn't in brackets that
	5834	may be referenced. */
	5835
	5836	else if (op == OP_TYPESTAR \|\| op == OP_TYPEMINSTAR \|\| op == OP_TYPEPOSSTAR)
	5837	{
	5838	if (scode[1] != OP_ANY \|\| (bracket_map & backref_map) != 0) return FALSE;
	5839	}
	5840
	5841	/* Check for explicit circumflex */
	5842
	5843	else if (op != OP_CIRC) return FALSE;
	5844
	5845	/* Move on to the next alternative */
	5846
	5847	code += GET(code, 1);
	5848	}
	5849	while (code == OP_ALT); / Loop for each alternative */
	5850	return TRUE;
	5851	}
	5852
	5853
	5854
	5855	/*************************************************
	5856	* Check for asserted fixed first char *
	5857	*************************************************/
	5858
	5859	/* During compilation, the "first char" settings from forward assertions are
	5860	discarded, because they can cause conflicts with actual literals that follow.
	5861	However, if we end up without a first char setting for an unanchored pattern,
	5862	it is worth scanning the regex to see if there is an initial asserted first
	5863	char. If all branches start with the same asserted char, or with a bracket all
	5864	of whose alternatives start with the same asserted char (recurse ad lib), then
	5865	we return that char, otherwise -1.
	5866
	5867	Arguments:
	5868	code points to start of expression (the bracket)
	5869	options pointer to the options (used to check casing changes)
	5870	inassert TRUE if in an assertion
	5871
	5872	Returns: -1 or the fixed first char
	5873	*/
	5874
	5875	static int
	5876	find_firstassertedchar(const uschar code, int options, BOOL inassert)
	5877	{
	5878	register int c = -1;
	5879	do {
	5880	int d;
	5881	const uschar *scode =
	5882	first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
	5883	register int op = *scode;
	5884
	5885	switch(op)
	5886	{
	5887	default:
	5888	return -1;
	5889
	5890	case OP_BRA:
	5891	case OP_CBRA:
	5892	case OP_ASSERT:
	5893	case OP_ONCE:
	5894	case OP_COND:
	5895	if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
	5896	return -1;
	5897	if (c < 0) c = d; else if (c != d) return -1;
	5898	break;
	5899
	5900	case OP_EXACT: /* Fall through */
	5901	scode += 2;
	5902
	5903	case OP_CHAR:
	5904	case OP_CHARNC:
	5905	case OP_PLUS:
	5906	case OP_MINPLUS:
	5907	case OP_POSPLUS:
	5908	if (!inassert) return -1;
	5909	if (c < 0)
	5910	{
	5911	c = scode[1];
	5912	if ((*options & PCRE_CASELESS) != 0) c \|= REQ_CASELESS;
	5913	}
	5914	else if (c != scode[1]) return -1;
	5915	break;
	5916	}
	5917
	5918	code += GET(code, 1);
	5919	}
	5920	while (*code == OP_ALT);
	5921	return c;
	5922	}
	5923
	5924
	5925
	5926	/*************************************************
	5927	* Compile a Regular Expression *
	5928	*************************************************/
	5929
	5930	/* This function takes a string and returns a pointer to a block of store
	5931	holding a compiled version of the expression. The original API for this
	5932	function had no error code return variable; it is retained for backwards
	5933	compatibility. The new function is given a new name.
	5934
	5935	Arguments:
	5936	pattern the regular expression
	5937	options various option bits
	5938	errorcodeptr pointer to error code variable (pcre_compile2() only)
	5939	can be NULL if you don't want a code value
	5940	errorptr pointer to pointer to error text
	5941	erroroffset ptr offset in pattern where error was detected
	5942	tables pointer to character tables or NULL
	5943
	5944	Returns: pointer to compiled data block, or NULL on error,
	5945	with errorptr and erroroffset set
	5946	*/
	5947
	5948	PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
	5949	pcre_compile(const char pattern, int options, const char *errorptr,
	5950	int erroroffset, const unsigned char tables)
	5951	{
	5952	return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
	5953	}
	5954
	5955
	5956	PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
	5957	pcre_compile2(const char pattern, int options, int errorcodeptr,
	5958	const char *errorptr, int erroroffset, const unsigned char *tables)
	5959	{
	5960	real_pcre *re;
	5961	int length = 1; /* For final END opcode */
	5962	int firstbyte, reqbyte, newline;
	5963	int errorcode = 0;
	5964	int skipatstart = 0;
	5965	#ifdef SUPPORT_UTF8
	5966	BOOL utf8;
	5967	#endif
	5968	size_t size;
	5969	uschar *code;
	5970	const uschar *codestart;
	5971	const uschar *ptr;
	5972	compile_data compile_block;
	5973	compile_data *cd = &compile_block;
	5974
	5975	/* This space is used for "compiling" into during the first phase, when we are
	5976	computing the amount of memory that is needed. Compiled items are thrown away
	5977	as soon as possible, so that a fairly large buffer should be sufficient for
	5978	this purpose. The same space is used in the second phase for remembering where
	5979	to fill in forward references to subpatterns. */
	5980
	5981	uschar cworkspace[COMPILE_WORK_SIZE];
	5982
	5983	/* Set this early so that early errors get offset 0. */
	5984
	5985	ptr = (const uschar *)pattern;
	5986
	5987	/* We can't pass back an error message if errorptr is NULL; I guess the best we
	5988	can do is just return NULL, but we can set a code value if there is a code
	5989	pointer. */
	5990
	5991	if (errorptr == NULL)
	5992	{
	5993	if (errorcodeptr != NULL) *errorcodeptr = 99;
	5994	return NULL;
	5995	}
	5996
	5997	*errorptr = NULL;
	5998	if (errorcodeptr != NULL) *errorcodeptr = ERR0;
	5999
	6000	/* However, we can give a message for this error */
	6001
	6002	if (erroroffset == NULL)
	6003	{
	6004	errorcode = ERR16;
	6005	goto PCRE_EARLY_ERROR_RETURN2;
	6006	}
	6007
	6008	*erroroffset = 0;
	6009
	6010	/* Can't support UTF8 unless PCRE has been compiled to include the code. */
	6011
	6012	#ifdef SUPPORT_UTF8
	6013	utf8 = (options & PCRE_UTF8) != 0;
	6014	if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
	6015	(erroroffset = _pcre_valid_utf8((uschar )pattern, -1)) >= 0)
	6016	{
	6017	errorcode = ERR44;
	6018	goto PCRE_EARLY_ERROR_RETURN2;
	6019	}
	6020	#else
	6021	if ((options & PCRE_UTF8) != 0)
	6022	{
	6023	errorcode = ERR32;
	6024	goto PCRE_EARLY_ERROR_RETURN;
	6025	}
	6026	#endif
	6027
	6028	if ((options & ~PUBLIC_OPTIONS) != 0)
	6029	{
	6030	errorcode = ERR17;
	6031	goto PCRE_EARLY_ERROR_RETURN;
	6032	}
	6033
	6034	/* Set up pointers to the individual character tables */
	6035
	6036	if (tables == NULL) tables = _pcre_default_tables;
	6037	cd->lcc = tables + lcc_offset;
	6038	cd->fcc = tables + fcc_offset;
	6039	cd->cbits = tables + cbits_offset;
	6040	cd->ctypes = tables + ctypes_offset;
	6041
	6042	/* Check for global one-time settings at the start of the pattern, and remember
	6043	the offset for later. */
	6044
	6045	while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '*')
	6046	{
	6047	int newnl = 0;
	6048	int newbsr = 0;
	6049
	6050	if (strncmp((char *)(ptr+skipatstart+2), "CR)", 3) == 0)
	6051	{ skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
	6052	else if (strncmp((char *)(ptr+skipatstart+2), "LF)", 3) == 0)
	6053	{ skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
	6054	else if (strncmp((char *)(ptr+skipatstart+2), "CRLF)", 5) == 0)
	6055	{ skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
	6056	else if (strncmp((char *)(ptr+skipatstart+2), "ANY)", 4) == 0)
	6057	{ skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
	6058	else if (strncmp((char *)(ptr+skipatstart+2), "ANYCRLF)", 8) == 0)
	6059	{ skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
	6060
	6061	else if (strncmp((char *)(ptr+skipatstart+2), "BSR_ANYCRLF)", 12) == 0)
	6062	{ skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
	6063	else if (strncmp((char *)(ptr+skipatstart+2), "BSR_UNICODE)", 12) == 0)
	6064	{ skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
	6065
	6066	if (newnl != 0)
	6067	options = (options & ~PCRE_NEWLINE_BITS) \| newnl;
	6068	else if (newbsr != 0)
	6069	options = (options & ~(PCRE_BSR_ANYCRLF\|PCRE_BSR_UNICODE)) \| newbsr;
	6070	else break;
	6071	}
	6072
	6073	/* Check validity of \R options. */
	6074
	6075	switch (options & (PCRE_BSR_ANYCRLF\|PCRE_BSR_UNICODE))
	6076	{
	6077	case 0:
	6078	case PCRE_BSR_ANYCRLF:
	6079	case PCRE_BSR_UNICODE:
	6080	break;
	6081	default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
	6082	}
	6083
	6084	/* Handle different types of newline. The three bits give seven cases. The
	6085	current code allows for fixed one- or two-byte sequences, plus "any" and
	6086	"anycrlf". */
	6087
	6088	switch (options & PCRE_NEWLINE_BITS)
	6089	{
	6090	case 0: newline = NEWLINE; break; /* Build-time default */
	6091	case PCRE_NEWLINE_CR: newline = '\r'; break;
	6092	case PCRE_NEWLINE_LF: newline = '\n'; break;
	6093	case PCRE_NEWLINE_CR+
	6094	PCRE_NEWLINE_LF: newline = ('\r' << 8) \| '\n'; break;
	6095	case PCRE_NEWLINE_ANY: newline = -1; break;
	6096	case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
	6097	default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
	6098	}
	6099
	6100	if (newline == -2)
	6101	{
	6102	cd->nltype = NLTYPE_ANYCRLF;
	6103	}
	6104	else if (newline < 0)
	6105	{
	6106	cd->nltype = NLTYPE_ANY;
	6107	}
	6108	else
	6109	{
	6110	cd->nltype = NLTYPE_FIXED;
	6111	if (newline > 255)
	6112	{
	6113	cd->nllen = 2;
	6114	cd->nl[0] = (newline >> 8) & 255;
	6115	cd->nl[1] = newline & 255;
	6116	}
	6117	else
	6118	{
	6119	cd->nllen = 1;
	6120	cd->nl[0] = newline;
	6121	}
	6122	}
	6123
	6124	/* Maximum back reference and backref bitmap. The bitmap records up to 31 back
	6125	references to help in deciding whether (.*) can be treated as anchored or not.
	6126	*/
	6127
	6128	cd->top_backref = 0;
	6129	cd->backref_map = 0;
	6130
	6131	/* Reflect pattern for debugging output */
	6132
	6133	DPRINTF(("------------------------------------------------------------------\n"));
	6134	DPRINTF(("%s\n", pattern));
	6135
	6136	/* Pretend to compile the pattern while actually just accumulating the length
	6137	of memory required. This behaviour is triggered by passing a non-NULL final
	6138	argument to compile_regex(). We pass a block of workspace (cworkspace) for it
	6139	to compile parts of the pattern into; the compiled code is discarded when it is
	6140	no longer needed, so hopefully this workspace will never overflow, though there
	6141	is a test for its doing so. */
	6142
	6143	cd->bracount = cd->final_bracount = 0;
	6144	cd->names_found = 0;
	6145	cd->name_entry_size = 0;
	6146	cd->name_table = NULL;
	6147	cd->start_workspace = cworkspace;
	6148	cd->start_code = cworkspace;
	6149	cd->hwm = cworkspace;
	6150	cd->start_pattern = (const uschar *)pattern;
	6151	cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
	6152	cd->req_varyopt = 0;
	6153	cd->external_options = options;
	6154	cd->external_flags = 0;
	6155
	6156	/* Now do the pre-compile. On error, errorcode will be set non-zero, so we
	6157	don't need to look at the result of the function here. The initial options have
	6158	been put into the cd block so that they can be changed if an option setting is
	6159	found within the regex right at the beginning. Bringing initial option settings
	6160	outside can help speed up starting point checks. */
	6161
	6162	ptr += skipatstart;
	6163	code = cworkspace;
	6164	*code = OP_BRA;
	6165	(void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
	6166	&code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
	6167	&length);
	6168	if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
	6169
	6170	DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
	6171	cd->hwm - cworkspace));
	6172
	6173	if (length > MAX_PATTERN_SIZE)
	6174	{
	6175	errorcode = ERR20;
	6176	goto PCRE_EARLY_ERROR_RETURN;
	6177	}
	6178
	6179	/* Compute the size of data block needed and get it, either from malloc or
	6180	externally provided function. Integer overflow should no longer be possible
	6181	because nowadays we limit the maximum value of cd->names_found and
	6182	cd->name_entry_size. */
	6183
	6184	size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
	6185	re = (real_pcre *)(pcre_malloc)(size);
	6186
	6187	if (re == NULL)
	6188	{
	6189	errorcode = ERR21;
	6190	goto PCRE_EARLY_ERROR_RETURN;
	6191	}
	6192
	6193	/* Put in the magic number, and save the sizes, initial options, internal
	6194	flags, and character table pointer. NULL is used for the default character
	6195	tables. The nullpad field is at the end; it's there to help in the case when a
	6196	regex compiled on a system with 4-byte pointers is run on another with 8-byte
	6197	pointers. */
	6198
	6199	re->magic_number = MAGIC_NUMBER;
	6200	re->size = size;
	6201	re->options = cd->external_options;
	6202	re->flags = cd->external_flags;
	6203	re->dummy1 = 0;
	6204	re->first_byte = 0;
	6205	re->req_byte = 0;
	6206	re->name_table_offset = sizeof(real_pcre);
	6207	re->name_entry_size = cd->name_entry_size;
	6208	re->name_count = cd->names_found;
	6209	re->ref_count = 0;
	6210	re->tables = (tables == _pcre_default_tables)? NULL : tables;
	6211	re->nullpad = NULL;
	6212
	6213	/* The starting points of the name/number translation table and of the code are
	6214	passed around in the compile data block. The start/end pattern and initial
	6215	options are already set from the pre-compile phase, as is the name_entry_size
	6216	field. Reset the bracket count and the names_found field. Also reset the hwm
	6217	field; this time it's used for remembering forward references to subpatterns.
	6218	*/
	6219
	6220	cd->final_bracount = cd->bracount; /* Save for checking forward references */
	6221	cd->bracount = 0;
	6222	cd->names_found = 0;
	6223	cd->name_table = (uschar *)re + re->name_table_offset;
	6224	codestart = cd->name_table + re->name_entry_size * re->name_count;
	6225	cd->start_code = codestart;
	6226	cd->hwm = cworkspace;
	6227	cd->req_varyopt = 0;
	6228	cd->had_accept = FALSE;
	6229
	6230	/* Set up a starting, non-extracting bracket, then compile the expression. On
	6231	error, errorcode will be set non-zero, so we don't need to look at the result
	6232	of the function here. */
	6233
	6234	ptr = (const uschar *)pattern + skipatstart;
	6235	code = (uschar *)codestart;
	6236	*code = OP_BRA;
	6237	(void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
	6238	&errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
	6239	re->top_bracket = cd->bracount;
	6240	re->top_backref = cd->top_backref;
	6241	re->flags = cd->external_flags;
	6242
	6243	if (cd->had_accept) reqbyte = -1; /* Must disable after (ACCEPT) /
	6244
	6245	/* If not reached end of pattern on success, there's an excess bracket. */
	6246
	6247	if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
	6248
	6249	/* Fill in the terminating state and check for disastrous overflow, but
	6250	if debugging, leave the test till after things are printed out. */
	6251
	6252	*code++ = OP_END;
	6253
	6254	#ifndef DEBUG
	6255	if (code - codestart > length) errorcode = ERR23;
	6256	#endif
	6257
	6258	/* Fill in any forward references that are required. */
	6259
	6260	while (errorcode == 0 && cd->hwm > cworkspace)
	6261	{
	6262	int offset, recno;
	6263	const uschar *groupptr;
	6264	cd->hwm -= LINK_SIZE;
	6265	offset = GET(cd->hwm, 0);
	6266	recno = GET(codestart, offset);
	6267	groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
	6268	if (groupptr == NULL) errorcode = ERR53;
	6269	else PUT(((uschar *)codestart), offset, groupptr - codestart);
	6270	}
	6271
	6272	/* Give an error if there's back reference to a non-existent capturing
	6273	subpattern. */
	6274
	6275	if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
	6276
	6277	/* Failed to compile, or error while post-processing */
	6278
	6279	if (errorcode != 0)
	6280	{
	6281	(pcre_free)(re);
	6282	PCRE_EARLY_ERROR_RETURN:
	6283	erroroffset = ptr - (const uschar )pattern;
	6284	PCRE_EARLY_ERROR_RETURN2:
	6285	*errorptr = find_error_text(errorcode);
	6286	if (errorcodeptr != NULL) *errorcodeptr = errorcode;
	6287	return NULL;
	6288	}
	6289
	6290	/* If the anchored option was not passed, set the flag if we can determine that
	6291	the pattern is anchored by virtue of ^ characters or \A or anything else (such
	6292	as starting with .* when DOTALL is set).
	6293
	6294	Otherwise, if we know what the first byte has to be, save it, because that
	6295	speeds up unanchored matches no end. If not, see if we can set the
	6296	PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
	6297	start with ^. and also when all branches start with .* for non-DOTALL matches.
	6298	*/
	6299
	6300	if ((re->options & PCRE_ANCHORED) == 0)
	6301	{
	6302	int temp_options = re->options; /* May get changed during these scans */
	6303	if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
	6304	re->options \|= PCRE_ANCHORED;
	6305	else
	6306	{
	6307	if (firstbyte < 0)
	6308	firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
	6309	if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
	6310	{
	6311	int ch = firstbyte & 255;
	6312	re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
	6313	cd->fcc[ch] == ch)? ch : firstbyte;
	6314	re->flags \|= PCRE_FIRSTSET;
	6315	}
	6316	else if (is_startline(codestart, 0, cd->backref_map))
	6317	re->flags \|= PCRE_STARTLINE;
	6318	}
	6319	}
	6320
	6321	/* For an anchored pattern, we use the "required byte" only if it follows a
	6322	variable length item in the regex. Remove the caseless flag for non-caseable
	6323	bytes. */
	6324
	6325	if (reqbyte >= 0 &&
	6326	((re->options & PCRE_ANCHORED) == 0 \|\| (reqbyte & REQ_VARY) != 0))
	6327	{
	6328	int ch = reqbyte & 255;
	6329	re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
	6330	cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
	6331	re->flags \|= PCRE_REQCHSET;
	6332	}
	6333
	6334	/* Print out the compiled data if debugging is enabled. This is never the
	6335	case when building a production library. */
	6336
	6337	#ifdef DEBUG
	6338
	6339	printf("Length = %d top_bracket = %d top_backref = %d\n",
	6340	length, re->top_bracket, re->top_backref);
	6341
	6342	printf("Options=%08x\n", re->options);
	6343
	6344	if ((re->flags & PCRE_FIRSTSET) != 0)
	6345	{
	6346	int ch = re->first_byte & 255;
	6347	const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
	6348	"" : " (caseless)";
	6349	if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
	6350	else printf("First char = \\x%02x%s\n", ch, caseless);
	6351	}
	6352
	6353	if ((re->flags & PCRE_REQCHSET) != 0)
	6354	{
	6355	int ch = re->req_byte & 255;
	6356	const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
	6357	"" : " (caseless)";
	6358	if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
	6359	else printf("Req char = \\x%02x%s\n", ch, caseless);
	6360	}
	6361
	6362	pcre_printint(re, stdout, TRUE);
	6363
	6364	/* This check is done here in the debugging case so that the code that
	6365	was compiled can be seen. */
	6366
	6367	if (code - codestart > length)
	6368	{
	6369	(pcre_free)(re);
	6370	*errorptr = find_error_text(ERR23);
	6371	erroroffset = ptr - (uschar )pattern;
	6372	if (errorcodeptr != NULL) *errorcodeptr = ERR23;
	6373	return NULL;
	6374	}
	6375	#endif /* DEBUG */
	6376
	6377	return (pcre *)re;
	6378	}
	6379
	6380	/* End of pcre_compile.c */

Note: See TracBrowser for help on using the repository browser.

Download in other formats: