[80] | 1 | /************************************************* |
---|
| 2 | * Perl-Compatible Regular Expressions * |
---|
| 3 | *************************************************/ |
---|
| 4 | |
---|
| 5 | |
---|
| 6 | /* PCRE is a library of functions to support regular expressions whose syntax |
---|
| 7 | and semantics are as close as possible to those of the Perl 5 language. |
---|
| 8 | |
---|
| 9 | Written by Philip Hazel |
---|
| 10 | Copyright (c) 1997-2008 University of Cambridge |
---|
| 11 | |
---|
| 12 | ----------------------------------------------------------------------------- |
---|
| 13 | Redistribution and use in source and binary forms, with or without |
---|
| 14 | modification, are permitted provided that the following conditions are met: |
---|
| 15 | |
---|
| 16 | * Redistributions of source code must retain the above copyright notice, |
---|
| 17 | this list of conditions and the following disclaimer. |
---|
| 18 | |
---|
| 19 | * Redistributions in binary form must reproduce the above copyright |
---|
| 20 | notice, this list of conditions and the following disclaimer in the |
---|
| 21 | documentation and/or other materials provided with the distribution. |
---|
| 22 | |
---|
| 23 | * Neither the name of the University of Cambridge nor the names of its |
---|
| 24 | contributors may be used to endorse or promote products derived from |
---|
| 25 | this software without specific prior written permission. |
---|
| 26 | |
---|
| 27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
---|
| 28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
---|
| 29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
---|
| 30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
---|
| 31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
---|
| 32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
---|
| 33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
---|
| 34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
---|
| 35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
---|
| 36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
---|
| 37 | POSSIBILITY OF SUCH DAMAGE. |
---|
| 38 | ----------------------------------------------------------------------------- |
---|
| 39 | */ |
---|
| 40 | |
---|
| 41 | /* This header contains definitions that are shared between the different |
---|
| 42 | modules, but which are not relevant to the exported API. This includes some |
---|
| 43 | functions whose names all begin with "_pcre_". */ |
---|
| 44 | |
---|
| 45 | #ifndef PCRE_INTERNAL_H |
---|
| 46 | #define PCRE_INTERNAL_H |
---|
| 47 | |
---|
| 48 | /* Define DEBUG to get debugging output on stdout. */ |
---|
| 49 | |
---|
| 50 | #if 0 |
---|
| 51 | #define DEBUG |
---|
| 52 | #endif |
---|
| 53 | |
---|
| 54 | /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef |
---|
| 55 | inline, and there are *still* stupid compilers about that don't like indented |
---|
| 56 | pre-processor statements, or at least there were when I first wrote this. After |
---|
| 57 | all, it had only been about 10 years then... |
---|
| 58 | |
---|
| 59 | It turns out that the Mac Debugging.h header also defines the macro DPRINTF, so |
---|
| 60 | be absolutely sure we get our version. */ |
---|
| 61 | |
---|
/* DPRINTF takes ONE argument: a complete, parenthesized printf() argument
list, as in DPRINTF(("count=%d\n", n)). Any earlier definition of the name is
dropped first. Without DEBUG the whole call expands to nothing, so the
argument list is never evaluated. */

#undef DPRINTF
#ifndef DEBUG
#define DPRINTF(p) /* Nothing */
#else
#define DPRINTF(p) printf p
#endif
| 68 | |
---|
| 69 | |
---|
| 70 | /* Standard C headers plus the external interface definition. The only time |
---|
| 71 | setjmp and stdarg are used is when NO_RECURSE is set. */ |
---|
| 72 | |
---|
| 73 | #include <ctype.h> |
---|
| 74 | #include <limits.h> |
---|
| 75 | #include <setjmp.h> |
---|
| 76 | #include <stdarg.h> |
---|
| 77 | #include <stddef.h> |
---|
| 78 | #include <stdio.h> |
---|
| 79 | #include <stdlib.h> |
---|
| 80 | #include <string.h> |
---|
| 81 | |
---|
| 82 | /* When compiling a DLL for Windows, the exported symbols have to be declared |
---|
| 83 | using some MS magic. I found some useful information on this web page: |
---|
| 84 | http://msdn2.microsoft.com/en-us/library/y4h7bcy6(VS.80).aspx. According to the |
---|
| 85 | information there, using __declspec(dllexport) without "extern" we have a |
---|
| 86 | definition; with "extern" we have a declaration. The settings here override the |
---|
| 87 | setting in pcre.h (which is included below); it defines only PCRE_EXP_DECL, |
---|
| 88 | which is all that is needed for applications (they just import the symbols). We |
---|
| 89 | use: |
---|
| 90 | |
---|
| 91 | PCRE_EXP_DECL for declarations |
---|
| 92 | PCRE_EXP_DEFN for definitions of exported functions |
---|
| 93 | PCRE_EXP_DATA_DEFN for definitions of exported variables |
---|
| 94 | |
---|
| 95 | The reason for the two DEFN macros is that in non-Windows environments, one |
---|
| 96 | does not want to have "extern" before variable definitions because it leads to |
---|
| 97 | compiler warnings. So we distinguish between functions and variables. In |
---|
| 98 | Windows, the two should always be the same. |
---|
| 99 | |
---|
| 100 | The reason for wrapping this in #ifndef PCRE_EXP_DECL is so that pcretest, |
---|
| 101 | which is an application, but needs to import this file in order to "peek" at |
---|
| 102 | internals, can #include pcre.h first to get an application's-eye view. |
---|
| 103 | |
---|
| 104 | In principle, people compiling for non-Windows, non-Unix-like (i.e. uncommon, |
---|
| 105 | special-purpose environments) might want to stick other stuff in front of |
---|
| 106 | exported symbols. That's why, in the non-Windows case, we set PCRE_EXP_DEFN and |
---|
| 107 | PCRE_EXP_DATA_DEFN only if they are not already set. */ |
---|
| 108 | |
---|
/* Nothing below is touched when PCRE_EXP_DECL is already defined on entry.
On _WIN32 all three macros are set unconditionally (dllexport unless
PCRE_STATIC is defined); elsewhere the two DEFN macros are only given
defaults, so a definition supplied by the build is kept. */

#if !defined(PCRE_EXP_DECL)
#  if defined(_WIN32)
#    if defined(PCRE_STATIC)
#      define PCRE_EXP_DECL       extern
#      define PCRE_EXP_DEFN
#      define PCRE_EXP_DATA_DEFN
#    else
#      define PCRE_EXP_DECL       extern __declspec(dllexport)
#      define PCRE_EXP_DEFN       __declspec(dllexport)
#      define PCRE_EXP_DATA_DEFN  __declspec(dllexport)
#    endif
#  else
#    if defined(__cplusplus)
#      define PCRE_EXP_DECL       extern "C"
#    else
#      define PCRE_EXP_DECL       extern
#    endif
#    if !defined(PCRE_EXP_DEFN)
#      define PCRE_EXP_DEFN       PCRE_EXP_DECL
#    endif
#    if !defined(PCRE_EXP_DATA_DEFN)
#      define PCRE_EXP_DATA_DEFN
#    endif
#  endif
#endif
| 134 | |
---|
| 135 | /* When compiling with the MSVC compiler, it is sometimes necessary to include |
---|
| 136 | a "calling convention" before exported function names. (This is secondhand |
---|
| 137 | information; I know nothing about MSVC myself). For example, something like |
---|
| 138 | |
---|
| 139 | void __cdecl function(....) |
---|
| 140 | |
---|
| 141 | might be needed. In order to make this easy, all the exported functions have |
---|
| 142 | PCRE_CALL_CONVENTION just before their names. It is rarely needed; if not |
---|
| 143 | set, we ensure here that it has no effect. */ |
---|
| 144 | |
---|
| 145 | #ifndef PCRE_CALL_CONVENTION |
---|
| 146 | #define PCRE_CALL_CONVENTION |
---|
| 147 | #endif |
---|
| 148 | |
---|
| 149 | /* We need to have types that specify unsigned 16-bit and 32-bit integers. We |
---|
| 150 | cannot determine these outside the compilation (e.g. by running a program as |
---|
| 151 | part of "configure") because PCRE is often cross-compiled for use on other |
---|
| 152 | systems. Instead we make use of the maximum sizes that are available at |
---|
| 153 | preprocessor time in standard C environments. */ |
---|
| 154 | |
---|
/* Pick the 16-bit and 32-bit integer types by comparing the <limits.h>
maxima at preprocessing time. The constants carry an unsigned suffix: an
unsuffixed 4294967295 does not fit in a 32-bit long, so compilers that
evaluate #if expressions in long (C90 rules) either warn about it or silently
treat it as unsigned. Spelling the comparison as unsigned removes the
ambiguity without changing which branch is selected. */

#if USHRT_MAX == 65535U
  typedef unsigned short pcre_uint16;
  typedef short pcre_int16;
#elif UINT_MAX == 65535U
  typedef unsigned int pcre_uint16;
  typedef int pcre_int16;
#else
  #error Cannot determine a type for 16-bit unsigned integers
#endif

#if UINT_MAX == 4294967295U
  typedef unsigned int pcre_uint32;
  typedef int pcre_int32;
#elif ULONG_MAX == 4294967295UL
  typedef unsigned long int pcre_uint32;
  typedef long int pcre_int32;
#else
  #error Cannot determine a type for 32-bit unsigned integers
#endif
| 174 | |
---|
| 175 | /* All character handling must be done as unsigned characters. Otherwise there |
---|
| 176 | are problems with top-bit-set characters and functions such as isspace(). |
---|
| 177 | However, we leave the interface to the outside world as char *, because that |
---|
| 178 | should make things easier for callers. We define a short type for unsigned char |
---|
| 179 | to save lots of typing. I tried "uchar", but it causes problems on Digital |
---|
| 180 | Unix, where it is defined in sys/types, so use "uschar" instead. */ |
---|
| 181 | |
---|
| 182 | typedef unsigned char uschar; |
---|
| 183 | |
---|
| 184 | /* This is an unsigned int value that no character can ever have. UTF-8 |
---|
| 185 | characters only go up to 0x7fffffff (though Unicode doesn't go beyond |
---|
| 186 | 0x0010ffff). */ |
---|
| 187 | |
---|
| 188 | #define NOTACHAR 0xffffffff |
---|
| 189 | |
---|
| 190 | /* PCRE is able to support several different kinds of newline (CR, LF, CRLF, |
---|
| 191 | "any" and "anycrlf" at present). The following macros are used to package up |
---|
| 192 | testing for newlines. NLBLOCK, PSSTART, and PSEND are defined in the various |
---|
| 193 | modules to indicate in which datablock the parameters exist, and what the |
---|
| 194 | start/end of string field names are. */ |
---|
| 195 | |
---|
#define NLTYPE_FIXED    0     /* Newline is a fixed length string */
#define NLTYPE_ANY      1     /* Newline is any Unicode line ending */
#define NLTYPE_ANYCRLF  2     /* Newline is CR, LF, or CRLF */

/* IS_NEWLINE(p) is non-zero when a newline starts at p.

For the non-fixed types the decision is delegated to _pcre_is_newline(), which
is only called when p is before NLBLOCK->PSEND and is handed the address of
NLBLOCK->nllen. For NLTYPE_FIXED the one or two bytes in NLBLOCK->nl are
compared directly, after checking that nllen bytes remain before PSEND.

The expansion refers to NLBLOCK, PSEND and a variable named utf8, all of which
must be in scope at the point of use. */

#define IS_NEWLINE(p) \
  ((NLBLOCK->nltype != NLTYPE_FIXED)? \
    ((p) < NLBLOCK->PSEND && \
     _pcre_is_newline((p), NLBLOCK->nltype, NLBLOCK->PSEND, \
       &(NLBLOCK->nllen), utf8)) \
    : \
    ((p) <= NLBLOCK->PSEND - NLBLOCK->nllen && \
     (p)[0] == NLBLOCK->nl[0] && \
     (NLBLOCK->nllen == 1 || (p)[1] == NLBLOCK->nl[1]) \
    ) \
  )

/* WAS_NEWLINE(p) is non-zero when a newline ends immediately before p.

It mirrors IS_NEWLINE: the non-fixed types go through _pcre_was_newline(),
called only when p is beyond NLBLOCK->PSSTART; the fixed type compares the
nllen bytes that precede p, after checking that they lie at or after PSSTART. */

#define WAS_NEWLINE(p) \
  ((NLBLOCK->nltype != NLTYPE_FIXED)? \
    ((p) > NLBLOCK->PSSTART && \
     _pcre_was_newline((p), NLBLOCK->nltype, NLBLOCK->PSSTART, \
       &(NLBLOCK->nllen), utf8)) \
    : \
    ((p) >= NLBLOCK->PSSTART + NLBLOCK->nllen && \
     (p)[-NLBLOCK->nllen] == NLBLOCK->nl[0] && \
     (NLBLOCK->nllen == 1 || (p)[-NLBLOCK->nllen+1] == NLBLOCK->nl[1]) \
    ) \
  )
| 227 | |
---|
| 228 | /* When PCRE is compiled as a C++ library, the subject pointer can be replaced |
---|
| 229 | with a custom type. This makes it possible, for example, to allow pcre_exec() |
---|
| 230 | to process subject strings that are discontinuous by using a smart pointer |
---|
| 231 | class. It must always be possible to inspect all of the subject string in |
---|
| 232 | pcre_exec() because of the way it backtracks. Two macros are required in the |
---|
| 233 | normal case, for sign-unspecified and unsigned char pointers. The former is |
---|
| 234 | used for the external interface and appears in pcre.h, which is why its name |
---|
| 235 | must begin with PCRE_. */ |
---|
| 236 | |
---|
/* Subject pointer types: PCRE_SPTR is the sign-unspecified form, USPTR the
unsigned one. Both become CUSTOM_SUBJECT_PTR when that macro is defined. */

#ifndef CUSTOM_SUBJECT_PTR
#define PCRE_SPTR const char *
#define USPTR const unsigned char *
#else
#define PCRE_SPTR CUSTOM_SUBJECT_PTR
#define USPTR CUSTOM_SUBJECT_PTR
#endif
| 244 | |
---|
| 245 | |
---|
| 246 | |
---|
| 247 | /* Include the public PCRE header and the definitions of UCP character property |
---|
| 248 | values. */ |
---|
| 249 | |
---|
| 250 | #include "pcre.h" |
---|
| 251 | #include "ucp.h" |
---|
| 252 | |
---|
| 253 | /* When compiling for use with the Virtual Pascal compiler, these functions |
---|
| 254 | need to have their names changed. PCRE must be compiled with the -DVPCOMPAT |
---|
| 255 | option on the command line. */ |
---|
| 256 | |
---|
#ifdef VPCOMPAT
#define strlen(s)        _strlen(s)
#define strncmp(s1,s2,m) _strncmp(s1,s2,m)
#define memcmp(s,c,n)    _memcmp(s,c,n)
#define memcpy(d,s,n)    _memcpy(d,s,n)
#define memmove(d,s,n)   _memmove(d,s,n)
#define memset(s,c,n)    _memset(s,c,n)
#else  /* VPCOMPAT */

/* To cope with SunOS4 and other systems that lack memmove() but have bcopy(),
define a macro for memmove() if HAVE_MEMMOVE is false, provided that HAVE_BCOPY
is set. Otherwise, include an emulating function for those systems that have
neither (there are some non-Unix environments where this is the case). */

#ifndef HAVE_MEMMOVE
#undef  memmove        /* some systems may have a macro */
#ifdef  HAVE_BCOPY
#define memmove(a, b, c) bcopy(b, a, c)
#else  /* HAVE_BCOPY */

/* Copy n bytes from s to d one byte at a time. When the destination address
is above the source the copy runs from the last byte down to the first,
otherwise from the first byte up, so overlapping regions are handled in both
directions. Returns d. */

static void *
pcre_memmove(void *d, const void *s, size_t n)
{
unsigned char *to = (unsigned char *)d;
const unsigned char *from = (const unsigned char *)s;
size_t left = n;

if (to > from)
  {
  to += n;
  from += n;
  while (left-- > 0) *(--to) = *(--from);
  }
else
  {
  while (left-- > 0) *to++ = *from++;
  }

return d;
}
#define memmove(a, b, c) pcre_memmove(a, b, c)
#endif  /* not HAVE_BCOPY */
#endif  /* not HAVE_MEMMOVE */
#endif  /* not VPCOMPAT */
| 299 | |
---|
| 300 | |
---|
| 301 | /* PCRE keeps offsets in its compiled code as 2-byte quantities (always stored |
---|
| 302 | in big-endian order) by default. These are used, for example, to link from the |
---|
| 303 | start of a subpattern to its alternatives and its end. The use of 2 bytes per |
---|
| 304 | offset limits the size of the compiled regex to around 64K, which is big enough |
---|
| 305 | for almost everybody. However, I received a request for an even bigger limit. |
---|
| 306 | For this reason, and also to make the code easier to maintain, the storing and |
---|
| 307 | loading of offsets from the byte string is now handled by the macros that are |
---|
| 308 | defined here. |
---|
| 309 | |
---|
| 310 | The macros are controlled by the value of LINK_SIZE. This defaults to 2 in |
---|
| 311 | the config.h file, but can be overridden by using -D on the command line. This |
---|
| 312 | is automated on Unix systems via the "configure" command. */ |
---|
| 313 | |
---|
/* PUT(a,n,d) stores d at a[n..] as LINK_SIZE bytes, most significant byte
first; GET(a,n) reads such a value back. The pointer argument is parenthesized
in PUT exactly as it already was in GET, so an expression such as "code + 1"
indexes correctly, and each PUT expansion is wrapped as a single
parenthesized comma expression. MAX_PATTERN_SIZE is the size limit that goes
with each LINK_SIZE. */

#if LINK_SIZE == 2

#define PUT(a,n,d)   \
  ((a)[n] = (d) >> 8, \
   (a)[(n)+1] = (d) & 255)

#define GET(a,n) \
  (((a)[n] << 8) | (a)[(n)+1])

#define MAX_PATTERN_SIZE (1 << 16)


#elif LINK_SIZE == 3

#define PUT(a,n,d)       \
  ((a)[n] = (d) >> 16,    \
   (a)[(n)+1] = (d) >> 8, \
   (a)[(n)+2] = (d) & 255)

#define GET(a,n) \
  (((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2])

#define MAX_PATTERN_SIZE (1 << 24)


#elif LINK_SIZE == 4

#define PUT(a,n,d)        \
  ((a)[n] = (d) >> 24,     \
   (a)[(n)+1] = (d) >> 16, \
   (a)[(n)+2] = (d) >> 8,  \
   (a)[(n)+3] = (d) & 255)

#define GET(a,n) \
  (((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3])

#define MAX_PATTERN_SIZE (1 << 30)   /* Keep it positive */


#else
#error LINK_SIZE must be either 2, 3, or 4
#endif


/* PUTINC stores like PUT and then advances the pointer by LINK_SIZE. Here "a"
must be a modifiable pointer variable, because it is assigned to. */

#define PUTINC(a,n,d)   (PUT(a,n,d), a += LINK_SIZE)
| 361 | |
---|
| 362 | |
---|
| 363 | /* PCRE uses some other 2-byte quantities that do not change when the size of |
---|
| 364 | offsets changes. These are used for repeat counts and for other things such as |
---|
| 365 | capturing parenthesis numbers in back references. */ |
---|
| 366 | |
---|
/* PUT2(a,n,d) stores the low 16 bits of d at a[n] and a[n+1], high byte
first; GET2(a,n) reads the pair back. PUT2 is a single parenthesized comma
expression rather than two statements, so it behaves as one unit after an
unbraced "if" or "else", and PUT2INC (store, then advance "a" by 2) is a
genuine single expression. For PUT2INC "a" must be a modifiable pointer
variable. */

#define PUT2(a,n,d)   \
  ((a)[n] = (d) >> 8, \
   (a)[(n)+1] = (d) & 255)

#define GET2(a,n) \
  (((a)[n] << 8) | (a)[(n)+1])

#define PUT2INC(a,n,d)  (PUT2(a,n,d), a += 2)
| 375 | |
---|
| 376 | |
---|
| 377 | /* When UTF-8 encoding is being used, a character is no longer just a single |
---|
| 378 | byte. The macros for character handling generate simple sequences when used in |
---|
| 379 | byte-mode, and more complicated ones for UTF-8 characters. BACKCHAR should |
---|
| 380 | never be called in byte mode. To make sure it can never even appear when UTF-8 |
---|
| 381 | support is omitted, we don't even define it. */ |
---|
| 382 | |
---|
/* Character-fetch macros. Every one of them expands to statements that already
end in ';' or '}', and assigns to its first argument, so they can only be used
where a statement is allowed. With SUPPORT_UTF8 the multi-byte forms decode
from the lead byte using _pcre_utf8_table3 and _pcre_utf8_table4; without it
each macro is a plain byte load. */

#ifdef SUPPORT_UTF8

/* GETCHAR: decode the character at eptr into c, leaving eptr where it is.
No test of UTF-8 mode is made; a lead byte below 0xc0 is taken as is. */

#define GETCHAR(c, eptr) \
  c = *eptr; \
  if (c >= 0xc0) \
    { \
    int gcii; \
    int gcaa = _pcre_utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int gcss = 6*gcaa; \
    c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
    for (gcii = 1; gcii <= gcaa; gcii++) \
      { \
      gcss -= 6; \
      c |= (eptr[gcii] & 0x3f) << gcss; \
      } \
    }

/* GETCHARTEST: as GETCHAR, but multi-byte decoding happens only when the
variable utf8 (which must be in scope) is non-zero. */

#define GETCHARTEST(c, eptr) \
  c = *eptr; \
  if (utf8 && c >= 0xc0) \
    { \
    int gcii; \
    int gcaa = _pcre_utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int gcss = 6*gcaa; \
    c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
    for (gcii = 1; gcii <= gcaa; gcii++) \
      { \
      gcss -= 6; \
      c |= (eptr[gcii] & 0x3f) << gcss; \
      } \
    }

/* GETCHARINC: decode the character at eptr into c and step eptr past every
byte that was consumed. No test of UTF-8 mode is made. */

#define GETCHARINC(c, eptr) \
  c = *eptr++; \
  if (c >= 0xc0) \
    { \
    int gcaa = _pcre_utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int gcss = 6*gcaa; \
    c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
    while (gcaa-- > 0) \
      { \
      gcss -= 6; \
      c |= (*eptr++ & 0x3f) << gcss; \
      } \
    }

/* GETCHARINCTEST: as GETCHARINC, but multi-byte decoding happens only when
the variable utf8 is non-zero. */

#define GETCHARINCTEST(c, eptr) \
  c = *eptr++; \
  if (utf8 && c >= 0xc0) \
    { \
    int gcaa = _pcre_utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int gcss = 6*gcaa; \
    c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
    while (gcaa-- > 0) \
      { \
      gcss -= 6; \
      c |= (*eptr++ & 0x3f) << gcss; \
      } \
    }

/* GETCHARLEN: as GETCHAR, and additionally adds the number of extra bytes to
len. No test of UTF-8 mode is made. */

#define GETCHARLEN(c, eptr, len) \
  c = *eptr; \
  if (c >= 0xc0) \
    { \
    int gcii; \
    int gcaa = _pcre_utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int gcss = 6*gcaa; \
    c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
    for (gcii = 1; gcii <= gcaa; gcii++) \
      { \
      gcss -= 6; \
      c |= (eptr[gcii] & 0x3f) << gcss; \
      } \
    len += gcaa; \
    }

/* BACKCHAR: while eptr points at a byte of the form 10xxxxxx, step it back
one byte. There is no UTF-8 mode test inside the macro. */

#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--

#else  /* not SUPPORT_UTF8 */

/* Byte mode: one byte is one character. GETCHARLEN leaves len untouched.
BACKCHAR is deliberately left undefined so that it cannot be used in a build
without UTF-8 support. */

#define GETCHAR(c, eptr) c = *eptr;
#define GETCHARTEST(c, eptr) c = *eptr;
#define GETCHARINC(c, eptr) c = *eptr++;
#define GETCHARINCTEST(c, eptr) c = *eptr++;
#define GETCHARLEN(c, eptr, len) c = *eptr;

#endif
| 488 | |
---|
| 489 | |
---|
| 490 | /* In case there is no definition of offsetof() provided - though any proper |
---|
| 491 | Standard C system should have one. */ |
---|
| 492 | |
---|
/* Fallback used only when no offsetof macro is defined at this point: the
address of the member within a structure placed at address zero, converted
to size_t. */

#if !defined(offsetof)
#define offsetof(p_type,field) ((size_t)&(((p_type *)0)->field))
#endif
| 496 | |
---|
| 497 | |
---|
| 498 | /* These are the public options that can change during matching. */ |
---|
| 499 | |
---|
| 500 | #define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL) |
---|
| 501 | |
---|
| 502 | /* Private flags containing information about the compiled regex. They used to |
---|
| 503 | live at the top end of the options word, but that got almost full, so now they |
---|
| 504 | are in a 16-bit flags word. */ |
---|
| 505 | |
---|
| 506 | #define PCRE_NOPARTIAL 0x0001 /* can't use partial with this regex */ |
---|
| 507 | #define PCRE_FIRSTSET 0x0002 /* first_byte is set */ |
---|
| 508 | #define PCRE_REQCHSET 0x0004 /* req_byte is set */ |
---|
| 509 | #define PCRE_STARTLINE 0x0008 /* start after \n for multiline */ |
---|
| 510 | #define PCRE_JCHANGED 0x0010 /* j option used in regex */ |
---|
| 511 | #define PCRE_HASCRORLF 0x0020 /* explicit \r or \n in pattern */ |
---|
| 512 | |
---|
| 513 | /* Options for the "extra" block produced by pcre_study(). */ |
---|
| 514 | |
---|
| 515 | #define PCRE_STUDY_MAPPED 0x01 /* a map of starting chars exists */ |
---|
| 516 | |
---|
| 517 | /* Masks for identifying the public options that are permitted at compile |
---|
| 518 | time, run time, or study time, respectively. */ |
---|
| 519 | |
---|
| 520 | #define PCRE_NEWLINE_BITS (PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|PCRE_NEWLINE_ANY| \ |
---|
| 521 | PCRE_NEWLINE_ANYCRLF) |
---|
| 522 | |
---|
| 523 | #define PUBLIC_OPTIONS \ |
---|
| 524 | (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \ |
---|
| 525 | PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \ |
---|
| 526 | PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \ |
---|
| 527 | PCRE_DUPNAMES|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE| \ |
---|
| 528 | PCRE_JAVASCRIPT_COMPAT) |
---|
| 529 | |
---|
| 530 | #define PUBLIC_EXEC_OPTIONS \ |
---|
| 531 | (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \ |
---|
| 532 | PCRE_PARTIAL|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE) |
---|
| 533 | |
---|
| 534 | #define PUBLIC_DFA_EXEC_OPTIONS \ |
---|
| 535 | (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \ |
---|
| 536 | PCRE_PARTIAL|PCRE_DFA_SHORTEST|PCRE_DFA_RESTART|PCRE_NEWLINE_BITS| \ |
---|
| 537 | PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE) |
---|
| 538 | |
---|
| 539 | #define PUBLIC_STUDY_OPTIONS 0 /* None defined */ |
---|
| 540 | |
---|
| 541 | /* Magic number to provide a small check against being handed junk. Also used |
---|
| 542 | to detect whether a pattern was compiled on a host of different endianness. */ |
---|
| 543 | |
---|
| 544 | #define MAGIC_NUMBER 0x50435245UL /* 'PCRE' */ |
---|
| 545 | |
---|
| 546 | /* Negative values for the firstchar and reqchar variables */ |
---|
| 547 | |
---|
| 548 | #define REQ_UNSET (-2) |
---|
| 549 | #define REQ_NONE (-1) |
---|
| 550 | |
---|
| 551 | /* The maximum remaining length of subject we are prepared to search for a |
---|
| 552 | req_byte match. */ |
---|
| 553 | |
---|
| 554 | #define REQ_BYTE_MAX 1000 |
---|
| 555 | |
---|
| 556 | /* Flags added to firstbyte or reqbyte; a "non-literal" item is either a |
---|
| 557 | variable-length repeat, or anything other than literal characters. */ |
---|
| 558 | |
---|
| 559 | #define REQ_CASELESS 0x0100 /* indicates caselessness */ |
---|
| 560 | #define REQ_VARY 0x0200 /* reqbyte followed non-literal item */ |
---|
| 561 | |
---|
| 562 | /* Miscellaneous definitions. The #ifndef is to pacify compiler warnings in |
---|
| 563 | environments where these macros are defined elsewhere. */ |
---|
| 564 | |
---|
/* BOOL, FALSE and TRUE are supplied together, and only when FALSE has not
already been defined as a macro. */

#if !defined(FALSE)
typedef int BOOL;

#define TRUE    1
#define FALSE   0
#endif
| 571 | |
---|
| 572 | /* Escape items that are just an encoding of a particular data value. */ |
---|
| 573 | |
---|
/* Default values for the data-character escapes. Each one is defined only if
the name is not already a macro, so a value supplied earlier is kept. ESC_e is
the number 27; the others use the compiler's own character escapes. */

#if !defined(ESC_e)
#define ESC_e 27
#endif

#if !defined(ESC_f)
#define ESC_f '\f'
#endif

#if !defined(ESC_n)
#define ESC_n '\n'
#endif

#if !defined(ESC_r)
#define ESC_r '\r'
#endif

/* We can't officially use ESC_t because it is a POSIX reserved identifier
(presumably because of all the others like size_t). */

#if !defined(ESC_tee)
#define ESC_tee '\t'
#endif
| 596 | |
---|
| 597 | /* Codes for different types of Unicode property */ |
---|
| 598 | |
---|
| 599 | #define PT_ANY 0 /* Any property - matches all chars */ |
---|
| 600 | #define PT_LAMP 1 /* L& - the union of Lu, Ll, Lt */ |
---|
| 601 | #define PT_GC 2 /* General characteristic (e.g. L) */ |
---|
| 602 | #define PT_PC 3 /* Particular characteristic (e.g. Lu) */ |
---|
| 603 | #define PT_SC 4 /* Script (e.g. Han) */ |
---|
| 604 | |
---|
| 605 | /* Flag bits and data types for the extended class (OP_XCLASS) for classes that |
---|
| 606 | contain UTF-8 characters with values greater than 255. */ |
---|
| 607 | |
---|
| 608 | #define XCL_NOT 0x01 /* Flag: this is a negative class */ |
---|
| 609 | #define XCL_MAP 0x02 /* Flag: a 32-byte map is present */ |
---|
| 610 | |
---|
| 611 | #define XCL_END 0 /* Marks end of individual items */ |
---|
| 612 | #define XCL_SINGLE 1 /* Single item (one multibyte char) follows */ |
---|
| 613 | #define XCL_RANGE 2 /* A range (two multibyte chars) follows */ |
---|
| 614 | #define XCL_PROP 3 /* Unicode property (2-byte property code follows) */ |
---|
| 615 | #define XCL_NOTPROP 4 /* Unicode inverted property (ditto) */ |
---|
| 616 | |
---|
| 617 | /* These are escaped items that aren't just an encoding of a particular data |
---|
| 618 | value such as \n. They must have non-zero values, as check_escape() returns |
---|
| 619 | their negation. Also, they must appear in the same order as in the opcode |
---|
| 620 | definitions below, up to ESC_z. There's a dummy for OP_ANY because it |
---|
| 621 | corresponds to "." rather than an escape sequence, and another for OP_ALLANY |
---|
| 622 | (which is used for [^] in JavaScript compatibility mode). |
---|
| 623 | |
---|
| 624 | The final escape must be ESC_REF as subsequent values are used for |
---|
| 625 | backreferences (\1, \2, \3, etc). There are two tests in the code for an escape |
---|
| 626 | greater than ESC_b and less than ESC_Z to detect the types that may be |
---|
| 627 | repeated. These are the types that consume characters. If any new escapes are |
---|
| 628 | put in between that don't consume a character, that code will have to change. |
---|
| 629 | */ |
---|
| 630 | |
---|
/* One enumerator per line, with its value, so the required ordering is easy
to check by eye. Numbering starts at 1; ESC_REF is last. */

enum {
  ESC_A = 1,   /*  1 */
  ESC_G,       /*  2 */
  ESC_K,       /*  3 */
  ESC_B,       /*  4 */
  ESC_b,       /*  5 */
  ESC_D,       /*  6 */
  ESC_d,       /*  7 */
  ESC_S,       /*  8 */
  ESC_s,       /*  9 */
  ESC_W,       /* 10 */
  ESC_w,       /* 11 */
  ESC_dum1,    /* 12  dummy: no escape sequence has this value */
  ESC_dum2,    /* 13  dummy: no escape sequence has this value */
  ESC_C,       /* 14 */
  ESC_P,       /* 15 */
  ESC_p,       /* 16 */
  ESC_R,       /* 17 */
  ESC_H,       /* 18 */
  ESC_h,       /* 19 */
  ESC_V,       /* 20 */
  ESC_v,       /* 21 */
  ESC_X,       /* 22 */
  ESC_Z,       /* 23 */
  ESC_z,       /* 24 */
  ESC_E,       /* 25 */
  ESC_Q,       /* 26 */
  ESC_g,       /* 27 */
  ESC_k,       /* 28 */
  ESC_REF      /* 29  must stay last */
};
| 635 | |
---|
| 636 | |
---|
| 637 | /* Opcode table: Starting from 1 (i.e. after OP_END), the values up to |
---|
| 638 | OP_EOD must correspond in order to the list of escapes immediately above. |
---|
| 639 | |
---|
| 640 | *** NOTE NOTE NOTE *** Whenever this list is updated, the two macro definitions |
---|
| 641 | that follow must also be updated to match. There is also a table called |
---|
| 642 | "coptable" in pcre_dfa_exec.c that must be updated. */ |
---|
| 643 | |
---|
| 644 | enum { |
---|
| 645 | OP_END, /* 0 End of pattern */ |
---|
| 646 | |
---|
| 647 | /* Values corresponding to backslashed metacharacters */ |
---|
| 648 | |
---|
| 649 | OP_SOD, /* 1 Start of data: \A */ |
---|
| 650 | OP_SOM, /* 2 Start of match (subject + offset): \G */ |
---|
| 651 | OP_SET_SOM, /* 3 Set start of match (\K) */ |
---|
| 652 | OP_NOT_WORD_BOUNDARY, /* 4 \B */ |
---|
| 653 | OP_WORD_BOUNDARY, /* 5 \b */ |
---|
| 654 | OP_NOT_DIGIT, /* 6 \D */ |
---|
| 655 | OP_DIGIT, /* 7 \d */ |
---|
| 656 | OP_NOT_WHITESPACE, /* 8 \S */ |
---|
| 657 | OP_WHITESPACE, /* 9 \s */ |
---|
| 658 | OP_NOT_WORDCHAR, /* 10 \W */ |
---|
| 659 | OP_WORDCHAR, /* 11 \w */ |
---|
| 660 | OP_ANY, /* 12 Match any character (subject to DOTALL) */ |
---|
| 661 | OP_ALLANY, /* 13 Match any character (not subject to DOTALL) */ |
---|
| 662 | OP_ANYBYTE, /* 14 Match any byte (\C); different to OP_ANY for UTF-8 */ |
---|
| 663 | OP_NOTPROP, /* 15 \P (not Unicode property) */ |
---|
| 664 | OP_PROP, /* 16 \p (Unicode property) */ |
---|
| 665 | OP_ANYNL, /* 17 \R (any newline sequence) */ |
---|
| 666 | OP_NOT_HSPACE, /* 18 \H (not horizontal whitespace) */ |
---|
| 667 | OP_HSPACE, /* 19 \h (horizontal whitespace) */ |
---|
| 668 | OP_NOT_VSPACE, /* 20 \V (not vertical whitespace) */ |
---|
| 669 | OP_VSPACE, /* 21 \v (vertical whitespace) */ |
---|
| 670 | OP_EXTUNI, /* 22 \X (extended Unicode sequence) */ |
---|
| 671 | OP_EODN, /* 23 End of data or \n at end of data: \Z. */ |
---|
| 672 | OP_EOD, /* 24 End of data: \z */ |
---|
| 673 | |
---|
| 674 | OP_OPT, /* 25 Set runtime options */ |
---|
| 675 | OP_CIRC, /* 26 Start of line - varies with multiline switch */ |
---|
| 676 | OP_DOLL, /* 27 End of line - varies with multiline switch */ |
---|
| 677 | OP_CHAR, /* 28 Match one character, casefully */ |
---|
| 678 | OP_CHARNC, /* 29 Match one character, caselessly */ |
---|
| 679 | OP_NOT, /* 30 Match one character, not the following one */ |
---|
| 680 | |
---|
| 681 | OP_STAR, /* 31 The maximizing and minimizing versions of */ |
---|
| 682 | OP_MINSTAR, /* 32 these six opcodes must come in pairs, with */ |
---|
| 683 | OP_PLUS, /* 33 the minimizing one second. */ |
---|
| 684 | OP_MINPLUS, /* 34 This first set applies to single characters.*/ |
---|
| 685 | OP_QUERY, /* 35 */ |
---|
| 686 | OP_MINQUERY, /* 36 */ |
---|
| 687 | |
---|
| 688 | OP_UPTO, /* 37 From 0 to n matches */ |
---|
| 689 | OP_MINUPTO, /* 38 */ |
---|
| 690 | OP_EXACT, /* 39 Exactly n matches */ |
---|
| 691 | |
---|
| 692 | OP_POSSTAR, /* 40 Possessified star */ |
---|
| 693 | OP_POSPLUS, /* 41 Possessified plus */ |
---|
| 694 | OP_POSQUERY, /* 42 Possessified query */ |
---|
| 695 | OP_POSUPTO, /* 43 Possessified upto */ |
---|
| 696 | |
---|
| 697 | OP_NOTSTAR, /* 44 The maximizing and minimizing versions of */ |
---|
| 698 | OP_NOTMINSTAR, /* 45 these six opcodes must come in pairs, with */ |
---|
| 699 | OP_NOTPLUS, /* 46 the minimizing one second. They must be in */ |
---|
| 700 | OP_NOTMINPLUS, /* 47 exactly the same order as those above. */ |
---|
| 701 | OP_NOTQUERY, /* 48 This set applies to "not" single characters. */ |
---|
| 702 | OP_NOTMINQUERY, /* 49 */ |
---|
| 703 | |
---|
| 704 | OP_NOTUPTO, /* 50 From 0 to n matches */ |
---|
| 705 | OP_NOTMINUPTO, /* 51 */ |
---|
| 706 | OP_NOTEXACT, /* 52 Exactly n matches */ |
---|
| 707 | |
---|
| 708 | OP_NOTPOSSTAR, /* 53 Possessified versions */ |
---|
| 709 | OP_NOTPOSPLUS, /* 54 */ |
---|
| 710 | OP_NOTPOSQUERY, /* 55 */ |
---|
| 711 | OP_NOTPOSUPTO, /* 56 */ |
---|
| 712 | |
---|
| 713 | OP_TYPESTAR, /* 57 The maximizing and minimizing versions of */ |
---|
| 714 | OP_TYPEMINSTAR, /* 58 these six opcodes must come in pairs, with */ |
---|
| 715 | OP_TYPEPLUS, /* 59 the minimizing one second. These codes must */ |
---|
| 716 | OP_TYPEMINPLUS, /* 60 be in exactly the same order as those above. */ |
---|
| 717 | OP_TYPEQUERY, /* 61 This set applies to character types such as \d */ |
---|
| 718 | OP_TYPEMINQUERY, /* 62 */ |
---|
| 719 | |
---|
| 720 | OP_TYPEUPTO, /* 63 From 0 to n matches */ |
---|
| 721 | OP_TYPEMINUPTO, /* 64 */ |
---|
| 722 | OP_TYPEEXACT, /* 65 Exactly n matches */ |
---|
| 723 | |
---|
| 724 | OP_TYPEPOSSTAR, /* 66 Possessified versions */ |
---|
| 725 | OP_TYPEPOSPLUS, /* 67 */ |
---|
| 726 | OP_TYPEPOSQUERY, /* 68 */ |
---|
| 727 | OP_TYPEPOSUPTO, /* 69 */ |
---|
| 728 | |
---|
| 729 | OP_CRSTAR, /* 70 The maximizing and minimizing versions of */ |
---|
| 730 | OP_CRMINSTAR, /* 71 all these opcodes must come in pairs, with */ |
---|
| 731 | OP_CRPLUS, /* 72 the minimizing one second. These codes must */ |
---|
| 732 | OP_CRMINPLUS, /* 73 be in exactly the same order as those above. */ |
---|
| 733 | OP_CRQUERY, /* 74 These are for character classes and back refs */ |
---|
| 734 | OP_CRMINQUERY, /* 75 */ |
---|
| 735 | OP_CRRANGE, /* 76 These are different to the three sets above. */ |
---|
| 736 | OP_CRMINRANGE, /* 77 */ |
---|
| 737 | |
---|
| 738 | OP_CLASS, /* 78 Match a character class, chars < 256 only */ |
---|
| 739 | OP_NCLASS, /* 79 Same, but the bitmap was created from a negative |
---|
| 740 | class - the difference is relevant only when a UTF-8 |
---|
| 741 | character > 255 is encountered. */ |
---|
| 742 | |
---|
| 743 | OP_XCLASS, /* 80 Extended class for handling UTF-8 chars within the |
---|
| 744 | class. This does both positive and negative. */ |
---|
| 745 | |
---|
| 746 | OP_REF, /* 81 Match a back reference */ |
---|
| 747 | OP_RECURSE, /* 82 Match a numbered subpattern (possibly recursive) */ |
---|
| 748 | OP_CALLOUT, /* 83 Call out to external function if provided */ |
---|
| 749 | |
---|
| 750 | OP_ALT, /* 84 Start of alternation */ |
---|
| 751 | OP_KET, /* 85 End of group that doesn't have an unbounded repeat */ |
---|
| 752 | OP_KETRMAX, /* 86 These two must remain together and in this */ |
---|
| 753 | OP_KETRMIN, /* 87 order. They are for groups that repeat for ever. */ |
---|
| 754 | |
---|
| 755 | /* The assertions must come before BRA, CBRA, ONCE, and COND.*/ |
---|
| 756 | |
---|
| 757 | OP_ASSERT, /* 88 Positive lookahead */ |
---|
| 758 | OP_ASSERT_NOT, /* 89 Negative lookahead */ |
---|
| 759 | OP_ASSERTBACK, /* 90 Positive lookbehind */ |
---|
| 760 | OP_ASSERTBACK_NOT, /* 91 Negative lookbehind */ |
---|
| 761 | OP_REVERSE, /* 92 Move pointer back - used in lookbehind assertions */ |
---|
| 762 | |
---|
| 763 | /* ONCE, BRA, CBRA, and COND must come after the assertions, with ONCE first, |
---|
| 764 | as there's a test for >= ONCE for a subpattern that isn't an assertion. */ |
---|
| 765 | |
---|
| 766 | OP_ONCE, /* 93 Atomic group */ |
---|
| 767 | OP_BRA, /* 94 Start of non-capturing bracket */ |
---|
| 768 | OP_CBRA, /* 95 Start of capturing bracket */ |
---|
| 769 | OP_COND, /* 96 Conditional group */ |
---|
| 770 | |
---|
| 771 | /* These three must follow BRA, CBRA, and COND, in the same order. There's a |
---|
| 772 | check for >= SBRA to distinguish the two sets. */ |
---|
| 773 | |
---|
| 774 | OP_SBRA, /* 97 Start of non-capturing bracket, check empty */ |
---|
| 775 | OP_SCBRA, /* 98 Start of capturing bracket, check empty */ |
---|
| 776 | OP_SCOND, /* 99 Conditional group, check empty */ |
---|
| 777 | |
---|
| 778 | OP_CREF, /* 100 Used to hold a capture number as condition */ |
---|
| 779 | OP_RREF, /* 101 Used to hold a recursion number as condition */ |
---|
| 780 | OP_DEF, /* 102 The DEFINE condition */ |
---|
| 781 | |
---|
| 782 | OP_BRAZERO, /* 103 These two must remain together and in this */ |
---|
| 783 | OP_BRAMINZERO, /* 104 order. */ |
---|
| 784 | |
---|
| 785 | /* These are backtracking control verbs */ |
---|
| 786 | |
---|
| 787 | OP_PRUNE, /* 105 (*PRUNE) */ |
---|
| 788 | OP_SKIP, /* 106 (*SKIP) */ |
---|
| 789 | OP_THEN, /* 107 (*THEN) */ |
---|
| 790 | OP_COMMIT, /* 108 (*COMMIT) */ |
---|
| 791 | |
---|
| 792 | /* These are forced failure and success verbs */ |
---|
| 793 | |
---|
| 794 | OP_FAIL, /* 109 (*FAIL) */ |
---|
| 795 | OP_ACCEPT, /* 110 (*ACCEPT) */ |
---|
| 796 | |
---|
| 797 | /* This is used to skip a subpattern with a {0} quantifier */ |
---|
| 798 | |
---|
| 799 | OP_SKIPZERO /* 111 */ |
---|
| 800 | }; |
---|
| 801 | |
---|
| 802 | |
---|
| 803 | /* This macro defines textual names for all the opcodes: one string per opcode, |
---|
| 804 | in opcode order. Used only for debugging; referenced only in pcre_printint.c. */ |
---|
| 805 | |
---|
| 806 | #define OP_NAME_LIST \ |
---|
| 807 | "End", "\\A", "\\G", "\\K", "\\B", "\\b", "\\D", "\\d", \ |
---|
| 808 | "\\S", "\\s", "\\W", "\\w", "Any", "AllAny", "Anybyte", \ |
---|
| 809 | "notprop", "prop", "\\R", "\\H", "\\h", "\\V", "\\v", \ |
---|
| 810 | "extuni", "\\Z", "\\z", \ |
---|
| 811 | "Opt", "^", "$", "char", "charnc", "not", \ |
---|
| 812 | "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \ |
---|
| 813 | "*+","++", "?+", "{", \ |
---|
| 814 | "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \ |
---|
| 815 | "*+","++", "?+", "{", \ |
---|
| 816 | "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \ |
---|
| 817 | "*+","++", "?+", "{", \ |
---|
| 818 | "*", "*?", "+", "+?", "?", "??", "{", "{", \ |
---|
| 819 | "class", "nclass", "xclass", "Ref", "Recurse", "Callout", \ |
---|
| 820 | "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", \ |
---|
| 821 | "AssertB", "AssertB not", "Reverse", \ |
---|
| 822 | "Once", "Bra", "CBra", "Cond", "SBra", "SCBra", "SCond", \ |
---|
| 823 | "Cond ref", "Cond rec", "Cond def", "Brazero", "Braminzero", \ |
---|
| 824 | "*PRUNE", "*SKIP", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT", \ |
---|
| 825 | "Skip zero" |
---|
| 826 | |
---|
| 827 | |
---|
| 828 | /* This macro defines the length of fixed length operations in the compiled |
---|
| 829 | regex. The lengths are used when searching for specific things, and also in the |
---|
| 830 | debugging printing of a compiled regex. We use a macro so that it can be |
---|
| 831 | defined close to the definitions of the opcodes themselves. |
---|
| 832 | |
---|
| 833 | As things have been extended, some of these are no longer fixed lengths, but are |
---|
| 834 | minima instead. For example, the length of a single-character repeat may vary |
---|
| 835 | in UTF-8 mode. The code that uses this table must know about such things. */ |
---|
| 836 | |
---|
| 837 | #define OP_LENGTHS \ |
---|
| 838 | 1, /* End */ \ |
---|
| 839 | 1, 1, 1, 1, 1, /* \A, \G, \K, \B, \b */ \ |
---|
| 840 | 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */ \ |
---|
| 841 | 1, 1, 1, /* Any, AllAny, Anybyte */ \ |
---|
| 842 | 3, 3, 1, /* NOTPROP, PROP, \R */ \ |
---|
| 843 | 1, 1, 1, 1, 1, /* \H, \h, \V, \v, EXTUNI */ \ |
---|
| 844 | 1, 1, 2, 1, 1, /* \Z, \z, Opt, ^, $ */ \ |
---|
| 845 | 2, /* Char - the minimum length */ \ |
---|
| 846 | 2, /* Charnc - the minimum length */ \ |
---|
| 847 | 2, /* not */ \ |
---|
| 848 | /* Positive single-char repeats ** These are */ \ |
---|
| 849 | 2, 2, 2, 2, 2, 2, /* *, *?, +, +?, ?, ?? ** minima in */ \ |
---|
| 850 | 4, 4, 4, /* upto, minupto, exact ** UTF-8 mode */ \ |
---|
| 851 | 2, 2, 2, 4, /* *+, ++, ?+, upto+ */ \ |
---|
| 852 | /* Negative single-char repeats - only for chars < 256 */ \ |
---|
| 853 | 2, 2, 2, 2, 2, 2, /* NOT *, *?, +, +?, ?, ?? */ \ |
---|
| 854 | 4, 4, 4, /* NOT upto, minupto, exact */ \ |
---|
| 855 | 2, 2, 2, 4, /* Possessive *, +, ?, upto */ \ |
---|
| 856 | /* Positive type repeats */ \ |
---|
| 857 | 2, 2, 2, 2, 2, 2, /* Type *, *?, +, +?, ?, ?? */ \ |
---|
| 858 | 4, 4, 4, /* Type upto, minupto, exact */ \ |
---|
| 859 | 2, 2, 2, 4, /* Possessive *+, ++, ?+, upto+ */ \ |
---|
| 860 | /* Character class & ref repeats */ \ |
---|
| 861 | 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ \ |
---|
| 862 | 5, 5, /* CRRANGE, CRMINRANGE */ \ |
---|
| 863 | 33, /* CLASS */ \ |
---|
| 864 | 33, /* NCLASS */ \ |
---|
| 865 | 0, /* XCLASS - variable length */ \ |
---|
| 866 | 3, /* REF */ \ |
---|
| 867 | 1+LINK_SIZE, /* RECURSE */ \ |
---|
| 868 | 2+2*LINK_SIZE, /* CALLOUT */ \ |
---|
| 869 | 1+LINK_SIZE, /* Alt */ \ |
---|
| 870 | 1+LINK_SIZE, /* Ket */ \ |
---|
| 871 | 1+LINK_SIZE, /* KetRmax */ \ |
---|
| 872 | 1+LINK_SIZE, /* KetRmin */ \ |
---|
| 873 | 1+LINK_SIZE, /* Assert */ \ |
---|
| 874 | 1+LINK_SIZE, /* Assert not */ \ |
---|
| 875 | 1+LINK_SIZE, /* Assert behind */ \ |
---|
| 876 | 1+LINK_SIZE, /* Assert behind not */ \ |
---|
| 877 | 1+LINK_SIZE, /* Reverse */ \ |
---|
| 878 | 1+LINK_SIZE, /* ONCE */ \ |
---|
| 879 | 1+LINK_SIZE, /* BRA */ \ |
---|
| 880 | 3+LINK_SIZE, /* CBRA */ \ |
---|
| 881 | 1+LINK_SIZE, /* COND */ \ |
---|
| 882 | 1+LINK_SIZE, /* SBRA */ \ |
---|
| 883 | 3+LINK_SIZE, /* SCBRA */ \ |
---|
| 884 | 1+LINK_SIZE, /* SCOND */ \ |
---|
| 885 | 3, /* CREF */ \ |
---|
| 886 | 3, /* RREF */ \ |
---|
| 887 | 1, /* DEF */ \ |
---|
| 888 | 1, 1, /* BRAZERO, BRAMINZERO */ \ |
---|
| 889 | 1, 1, 1, 1, /* PRUNE, SKIP, THEN, COMMIT, */ \ |
---|
| 890 | 1, 1, 1 /* FAIL, ACCEPT, SKIPZERO */ |
---|
| 891 | |
---|
| 892 | |
---|
| 893 | /* A magic value for OP_RREF to indicate the "any recursion" condition. */ |
---|
| 894 | |
---|
| 895 | #define RREF_ANY 0xffff /* NOTE(review): presumably the all-ones value of a 2-byte operand (OP_LENGTHS lists RREF as 3) - confirm */ |
---|
| 896 | |
---|
| 897 | /* Error code numbers. They are given names so that they can more easily be |
---|
| 898 | tracked. */ |
---|
| 899 | |
---|
| 900 | enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, |
---|
| 901 | ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, |
---|
| 902 | ERR20, ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, |
---|
| 903 | ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, |
---|
| 904 | ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, |
---|
| 905 | ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, |
---|
| 906 | ERR60, ERR61, ERR62, ERR63, ERR64 }; /* 65 codes; ERRn has the value n */ |
---|
| 907 | |
---|
| 908 | /* The real format of the start of the pcre block; the index of names and the |
---|
| 909 | code vector run on as long as necessary after the end. We store an explicit |
---|
| 910 | offset to the name table so that if a regex is compiled on one host, saved, and |
---|
| 911 | then run on another where the size of pointers is different, all might still |
---|
| 912 | be well. For the case of compiled-on-4 and run-on-8, we include an extra |
---|
| 913 | pointer that is always NULL. For future-proofing, a few dummy fields were |
---|
| 914 | originally included - even though you can never get this planning right - but |
---|
| 915 | there is only one left now. |
---|
| 916 | |
---|
| 917 | NOTE NOTE NOTE: |
---|
| 918 | Because people can now save and re-use compiled patterns, any additions to this |
---|
| 919 | structure should be made at the end, and something earlier (e.g. a new |
---|
| 920 | flag in the options or one of the dummy fields) should indicate that the new |
---|
| 921 | fields are present. Currently PCRE always sets the dummy fields to zero. |
---|
| 922 | NOTE NOTE NOTE: |
---|
| 923 | */ |
---|
| 924 | |
---|
| 925 | typedef struct real_pcre { /* Compiled patterns can be saved and re-used: add new fields only at the end */ |
---|
| 926 | pcre_uint32 magic_number; |
---|
| 927 | pcre_uint32 size; /* Total that was malloced */ |
---|
| 928 | pcre_uint32 options; /* Public options */ |
---|
| 929 | pcre_uint16 flags; /* Private flags */ |
---|
| 930 | pcre_uint16 dummy1; /* For future use; currently always set to zero */ |
---|
| 931 | pcre_uint16 top_bracket; |
---|
| 932 | pcre_uint16 top_backref; |
---|
| 933 | pcre_uint16 first_byte; |
---|
| 934 | pcre_uint16 req_byte; |
---|
| 935 | pcre_uint16 name_table_offset; /* Offset to name table that follows */ |
---|
| 936 | pcre_uint16 name_entry_size; /* Size of any name items */ |
---|
| 937 | pcre_uint16 name_count; /* Number of name items */ |
---|
| 938 | pcre_uint16 ref_count; /* Reference count */ |
---|
| 939 | |
---|
| 940 | const unsigned char *tables; /* Pointer to tables or NULL for std */ |
---|
| 941 | const unsigned char *nullpad; /* Always NULL; extra pointer for the compiled-on-4, run-on-8 case */ |
---|
| 942 | } real_pcre; |
---|
| 943 | |
---|
| 944 | /* The format of the block used to store data from pcre_study(). The same |
---|
| 945 | remark (see NOTE above) about extending this structure applies. */ |
---|
| 946 | |
---|
| 947 | typedef struct pcre_study_data { |
---|
| 948 | pcre_uint32 size; /* Total that was malloced */ |
---|
| 949 | pcre_uint32 options; |
---|
| 950 | uschar start_bits[32]; /* 32 bytes; NOTE(review): presumably a 256-bit map of possible starting bytes - confirm */ |
---|
| 951 | } pcre_study_data; |
---|
| 952 | |
---|
| 953 | /* Structure for passing "static" information around between the functions |
---|
| 954 | doing the compiling, so that they are thread-safe. */ |
---|
| 955 | |
---|
| 956 | typedef struct compile_data { /* Passed between the compile functions in place of statics, for thread safety */ |
---|
| 957 | const uschar *lcc; /* Points to lower casing table */ |
---|
| 958 | const uschar *fcc; /* Points to case-flipping table */ |
---|
| 959 | const uschar *cbits; /* Points to character type table */ |
---|
| 960 | const uschar *ctypes; /* Points to table of type maps */ |
---|
| 961 | const uschar *start_workspace;/* The start of working space */ |
---|
| 962 | const uschar *start_code; /* The start of the compiled code */ |
---|
| 963 | const uschar *start_pattern; /* The start of the pattern */ |
---|
| 964 | const uschar *end_pattern; /* The end of the pattern */ |
---|
| 965 | uschar *hwm; /* High watermark of workspace */ |
---|
| 966 | uschar *name_table; /* The name/number table */ |
---|
| 967 | int names_found; /* Number of entries so far */ |
---|
| 968 | int name_entry_size; /* Size of each entry */ |
---|
| 969 | int bracount; /* Count of capturing parens as we compile */ |
---|
| 970 | int final_bracount; /* Saved value after first pass */ |
---|
| 971 | int top_backref; /* Maximum back reference */ |
---|
| 972 | unsigned int backref_map; /* Bitmap of low back refs */ |
---|
| 973 | int external_options; /* External (initial) options */ |
---|
| 974 | int external_flags; /* External flag bits to be set */ |
---|
| 975 | int req_varyopt; /* "After variable item" flag for reqbyte */ |
---|
| 976 | BOOL had_accept; /* (*ACCEPT) encountered */ |
---|
| 977 | int nltype; /* Newline type */ |
---|
| 978 | int nllen; /* Newline string length */ |
---|
| 979 | uschar nl[4]; /* Newline string when fixed length */ |
---|
| 980 | } compile_data; |
---|
| 981 | |
---|
| 982 | /* Structure for maintaining a chain of pointers to the currently incomplete |
---|
| 983 | branches, for testing for left recursion. */ |
---|
| 984 | |
---|
| 985 | typedef struct branch_chain { |
---|
| 986 | struct branch_chain *outer; /* Next entry in the chain; NOTE(review): presumably the enclosing branch - confirm */ |
---|
| 987 | uschar *current; /* NOTE(review): presumably points at the incomplete branch this entry represents - confirm */ |
---|
| 988 | } branch_chain; |
---|
| 989 | |
---|
| 990 | /* Structure for items in a linked list that represents an explicit recursive |
---|
| 991 | call within the pattern. */ |
---|
| 992 | |
---|
| 993 | typedef struct recursion_info { /* One list item per explicit recursive call; items are linked through prevrec */ |
---|
| 994 | struct recursion_info *prevrec; /* Previous recursion record (or NULL) */ |
---|
| 995 | int group_num; /* Number of group that was called */ |
---|
| 996 | const uschar *after_call; /* "Return value": points after the call in the expr */ |
---|
| 997 | USPTR save_start; /* Old value of mstart */ |
---|
| 998 | int *offset_save; /* Pointer to start of saved offsets */ |
---|
| 999 | int saved_max; /* Number of saved offsets */ |
---|
| 1000 | } recursion_info; |
---|
| 1001 | |
---|
| 1002 | /* Structure for building a chain of data for holding the values of the subject |
---|
| 1003 | pointer at the start of each subpattern, so as to detect when an empty string |
---|
| 1004 | has been matched by a subpattern - to break infinite loops. */ |
---|
| 1005 | |
---|
| 1006 | typedef struct eptrblock { |
---|
| 1007 | struct eptrblock *epb_prev; /* Previous block in the chain */ |
---|
| 1008 | USPTR epb_saved_eptr; /* Subject pointer value at the start of a subpattern */ |
---|
| 1009 | } eptrblock; |
---|
| 1010 | |
---|
| 1011 | |
---|
| 1012 | /* Structure for passing "static" information around between the functions |
---|
| 1013 | doing traditional NFA matching, so that they are thread-safe. */ |
---|
| 1014 | |
---|
| 1015 | typedef struct match_data { /* Passed between the NFA matching functions in place of statics, for thread safety */ |
---|
| 1016 | unsigned long int match_call_count; /* As it says (NOTE(review): presumably checked against match_limit - confirm) */ |
---|
| 1017 | unsigned long int match_limit; /* As it says */ |
---|
| 1018 | unsigned long int match_limit_recursion; /* As it says */ |
---|
| 1019 | int *offset_vector; /* Offset vector */ |
---|
| 1020 | int offset_end; /* One past the end */ |
---|
| 1021 | int offset_max; /* The maximum usable for return data */ |
---|
| 1022 | int nltype; /* Newline type */ |
---|
| 1023 | int nllen; /* Newline string length */ |
---|
| 1024 | uschar nl[4]; /* Newline string when fixed */ |
---|
| 1025 | const uschar *lcc; /* Points to lower casing table */ |
---|
| 1026 | const uschar *ctypes; /* Points to table of type maps */ |
---|
| 1027 | BOOL offset_overflow; /* Set if too many extractions */ |
---|
| 1028 | BOOL notbol; /* NOTBOL flag */ |
---|
| 1029 | BOOL noteol; /* NOTEOL flag */ |
---|
| 1030 | BOOL utf8; /* UTF8 flag */ |
---|
| 1031 | BOOL jscript_compat; /* JAVASCRIPT_COMPAT flag */ |
---|
| 1032 | BOOL endonly; /* Dollar not before final \n */ |
---|
| 1033 | BOOL notempty; /* Empty string match not wanted */ |
---|
| 1034 | BOOL partial; /* PARTIAL flag */ |
---|
| 1035 | BOOL hitend; /* Hit the end of the subject at some point */ |
---|
| 1036 | BOOL bsr_anycrlf; /* \R is just any CRLF, not full Unicode */ |
---|
| 1037 | const uschar *start_code; /* For use when recursing */ |
---|
| 1038 | USPTR start_subject; /* Start of the subject string */ |
---|
| 1039 | USPTR end_subject; /* End of the subject string */ |
---|
| 1040 | USPTR start_match_ptr; /* Start of matched string */ |
---|
| 1041 | USPTR end_match_ptr; /* Subject position at end match */ |
---|
| 1042 | int end_offset_top; /* Highwater mark at end of match */ |
---|
| 1043 | int capture_last; /* Most recent capture number */ |
---|
| 1044 | int start_offset; /* The start offset value */ |
---|
| 1045 | eptrblock *eptrchain; /* Chain of eptrblocks for tail recursions */ |
---|
| 1046 | int eptrn; /* Next free eptrblock */ |
---|
| 1047 | recursion_info *recursive; /* Linked list of recursion data */ |
---|
| 1048 | void *callout_data; /* To pass back to callouts */ |
---|
| 1049 | } match_data; |
---|
| 1050 | |
---|
| 1051 | /* A similar structure is used for the same purpose by the DFA matching |
---|
| 1052 | functions. */ |
---|
| 1053 | |
---|
| 1054 | typedef struct dfa_match_data { /* DFA-matching counterpart of match_data */ |
---|
| 1055 | const uschar *start_code; /* Start of the compiled pattern */ |
---|
| 1056 | const uschar *start_subject; /* Start of the subject string */ |
---|
| 1057 | const uschar *end_subject; /* End of subject string */ |
---|
| 1058 | const uschar *tables; /* Character tables */ |
---|
| 1059 | int moptions; /* Match options */ |
---|
| 1060 | int poptions; /* Pattern options */ |
---|
| 1061 | int nltype; /* Newline type */ |
---|
| 1062 | int nllen; /* Newline string length */ |
---|
| 1063 | uschar nl[4]; /* Newline string when fixed */ |
---|
| 1064 | void *callout_data; /* To pass back to callouts */ |
---|
| 1065 | } dfa_match_data; |
---|
| 1066 | |
---|
| 1067 | /* Bit definitions for entries in the pcre_ctypes table. */ |
---|
| 1068 | |
---|
| 1069 | #define ctype_space 0x01 /* Each ctype_* value is a distinct single bit; 0x20 and 0x40 are not used here */ |
---|
| 1070 | #define ctype_letter 0x02 |
---|
| 1071 | #define ctype_digit 0x04 |
---|
| 1072 | #define ctype_xdigit 0x08 |
---|
| 1073 | #define ctype_word 0x10 /* alphanumeric or '_' */ |
---|
| 1074 | #define ctype_meta 0x80 /* regexp meta char or zero (end pattern) */ |
---|
| 1075 | |
---|
| 1076 | /* Offsets for the bitmap tables in pcre_cbits. Each table contains a set |
---|
| 1077 | of bits for a class map. Some classes are built by combining these tables. */ |
---|
| 1078 | |
---|
| 1079 | #define cbit_space 0 /* [:space:] or \s */ |
---|
| 1080 | #define cbit_xdigit 32 /* [:xdigit:] */ |
---|
| 1081 | #define cbit_digit 64 /* [:digit:] or \d */ |
---|
| 1082 | #define cbit_upper 96 /* [:upper:] */ |
---|
| 1083 | #define cbit_lower 128 /* [:lower:] */ |
---|
| 1084 | #define cbit_word 160 /* [:word:] or \w */ |
---|
| 1085 | #define cbit_graph 192 /* [:graph:] */ |
---|
| 1086 | #define cbit_print 224 /* [:print:] */ |
---|
| 1087 | #define cbit_punct 256 /* [:punct:] */ |
---|
| 1088 | #define cbit_cntrl 288 /* [:cntrl:] */ |
---|
| 1089 | #define cbit_length 320 /* Length of the cbits table: 10 maps at 32-byte intervals */ |
---|
| 1090 | |
---|
| 1091 | /* Offsets of the various tables from the base tables pointer, and |
---|
| 1092 | total length. */ |
---|
| 1093 | |
---|
| 1094 | #define lcc_offset 0 /* First table; the next one starts 256 bytes later */ |
---|
| 1095 | #define fcc_offset 256 /* Second table; the next one starts 256 bytes later */ |
---|
| 1096 | #define cbits_offset 512 /* Class bitmaps, cbit_length bytes long */ |
---|
| 1097 | #define ctypes_offset (cbits_offset + cbit_length) /* 512 + 320 = 832 */ |
---|
| 1098 | #define tables_length (ctypes_offset + 256) /* 832 + 256 = 1088 */ |
---|
| 1099 | |
---|
| 1100 | /* Layout of the UCP type table that translates property names into types and |
---|
| 1101 | codes. Each entry used to point directly to a name, but to reduce the number of |
---|
| 1102 | relocations in shared libraries, it now has an offset into a single string |
---|
| 1103 | instead. */ |
---|
| 1104 | |
---|
| 1105 | typedef struct { |
---|
| 1106 | pcre_uint16 name_offset; /* Offset of the property name within a single shared string */ |
---|
| 1107 | pcre_uint16 type; /* Type that the name translates to */ |
---|
| 1108 | pcre_uint16 value; /* NOTE(review): presumably the "code" that the name translates to - confirm */ |
---|
| 1109 | } ucp_type_table; |
---|
| 1110 | |
---|
| 1111 | |
---|
| 1112 | /* Internal shared data tables. These are tables that are used by more than one |
---|
| 1113 | of the exported public functions. They have to be "external" in the C sense, |
---|
| 1114 | but are not part of the PCRE public API. The data for these tables is in the |
---|
| 1115 | pcre_tables.c module. */ |
---|
| 1116 | |
---|
| 1117 | extern const int _pcre_utf8_table1[]; |
---|
| 1118 | extern const int _pcre_utf8_table2[]; |
---|
| 1119 | extern const int _pcre_utf8_table3[]; |
---|
| 1120 | extern const uschar _pcre_utf8_table4[]; |
---|
| 1121 | |
---|
| 1122 | extern const int _pcre_utf8_table1_size; |
---|
| 1123 | |
---|
| 1124 | extern const char _pcre_utt_names[]; /* NOTE(review): presumably the string that ucp_type_table.name_offset indexes - confirm */ |
---|
| 1125 | extern const ucp_type_table _pcre_utt[]; |
---|
| 1126 | extern const int _pcre_utt_size; |
---|
| 1127 | |
---|
| 1128 | extern const uschar _pcre_default_tables[]; |
---|
| 1129 | |
---|
| 1130 | extern const uschar _pcre_OP_lengths[]; /* NOTE(review): presumably initialised from the OP_LENGTHS macro - confirm */ |
---|
| 1131 | |
---|
| 1132 | |
---|
| 1133 | /* Internal shared functions. These are functions that are used by more than |
---|
| 1134 | one of the exported public functions. They have to be "external" in the C |
---|
| 1135 | sense, but are not part of the PCRE public API. */ |
---|
| 1136 | |
---|
| 1137 | extern BOOL _pcre_is_newline(const uschar *, int, const uschar *, |
---|
| 1138 | int *, BOOL); |
---|
| 1139 | extern int _pcre_ord2utf8(int, uschar *); |
---|
| 1140 | extern real_pcre *_pcre_try_flipped(const real_pcre *, real_pcre *, |
---|
| 1141 | const pcre_study_data *, pcre_study_data *); |
---|
| 1142 | extern int _pcre_valid_utf8(const uschar *, int); |
---|
| 1143 | extern BOOL _pcre_was_newline(const uschar *, int, const uschar *, /* Same parameter list as _pcre_is_newline */ |
---|
| 1144 | int *, BOOL); |
---|
| 1145 | extern BOOL _pcre_xclass(int, const uschar *); |
---|
| 1146 | |
---|
| 1147 | |
---|
| 1148 | /* Unicode character database (UCD) */ |
---|
| 1149 | |
---|
| 1150 | typedef struct { |
---|
| 1151 | uschar script; /* Read by UCD_SCRIPT */ |
---|
| 1152 | uschar chartype; /* Read by UCD_CHARTYPE; UCD_CATEGORY uses it to index _pcre_ucp_gentype */ |
---|
| 1153 | pcre_int32 other_case; /* Signed offset that UCD_OTHERCASE adds to the character value */ |
---|
| 1154 | } ucd_record; |
---|
| 1155 | |
---|
| 1156 | extern const ucd_record _pcre_ucd_records[]; /* Indexed by a _pcre_ucd_stage2 entry in GET_UCD */ |
---|
| 1157 | extern const uschar _pcre_ucd_stage1[]; /* Indexed by ch / UCD_BLOCK_SIZE in GET_UCD */ |
---|
| 1158 | extern const pcre_uint16 _pcre_ucd_stage2[]; /* Indexed by stage1 entry * UCD_BLOCK_SIZE + ch % UCD_BLOCK_SIZE */ |
---|
| 1159 | extern const int _pcre_ucp_gentype[]; /* Indexed by chartype in UCD_CATEGORY */ |
---|
| 1160 | |
---|
| 1161 | |
---|
| 1162 | /* UCD access macros */ |
---|
| 1163 | |
---|
| 1164 | #define UCD_BLOCK_SIZE 128 /* Number of characters covered by one stage-2 block */ |
---|
| 1165 | #define GET_UCD(ch) (_pcre_ucd_records + \ |
---|
| 1166 | _pcre_ucd_stage2[_pcre_ucd_stage1[(ch) / UCD_BLOCK_SIZE] * \ |
---|
| 1167 | UCD_BLOCK_SIZE + (ch) % UCD_BLOCK_SIZE]) |
---|
| 1168 | /* GET_UCD yields a pointer to the ucd_record for ch. Every macro here evaluates ch more than once (UCD_OTHERCASE three times), so the argument must have no side effects. Each use of ch is parenthesized so that expression arguments such as a + b expand correctly. */ |
---|
| 1169 | #define UCD_CHARTYPE(ch) GET_UCD(ch)->chartype |
---|
| 1170 | #define UCD_SCRIPT(ch) GET_UCD(ch)->script |
---|
| 1171 | #define UCD_CATEGORY(ch) _pcre_ucp_gentype[UCD_CHARTYPE(ch)] |
---|
| 1172 | #define UCD_OTHERCASE(ch) ((ch) + GET_UCD(ch)->other_case) |
---|
| 1173 | |
---|
| 1174 | #endif |
---|
| 1175 | |
---|
| 1176 | /* End of pcre_internal.h */ |
---|