r595 | ph10 | 2011-05-02 12:33:29 +0200 (Po, 02 kvě 2011) | 3 lines Fix problems with caseless reference matching in UTF-8 mode when the upper/lower case characters have different lengths. and r597 | ph10 | 2011-05-02 19:08:52 +0200 (Po, 02 kvě 2011) | 2 lines Complete incomplete fix for UTF-8 caseless references of different lengths. http://bugs.exim.org/show_bug.cgi?id=1074 Petr Pisar: Changelog and comment changes removed. Index: testdata/testoutput12 =================================================================== --- testdata/testoutput12 (revision 594) +++ testdata/testoutput12 (revision 595) @@ -1176,4 +1176,64 @@ End ------------------------------------------------------------------ +/-- These behaved oddly in Perl, so they are kept in this test --/ + +/(\x{23a}\x{23a}\x{23a})?\1/8i + \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65} +No match + +/(ȺȺȺ)?\1/8i + ȺȺȺⱥⱥ +No match + +/(\x{23a}\x{23a}\x{23a})?\1/8i + \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65} + 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65} + 1: \x{23a}\x{23a}\x{23a} + +/(ȺȺȺ)?\1/8i + ȺȺȺⱥⱥⱥ + 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65} + 1: \x{23a}\x{23a}\x{23a} + +/(\x{23a}\x{23a}\x{23a})\1/8i + \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65} +No match + +/(ȺȺȺ)\1/8i + ȺȺȺⱥⱥ +No match + +/(\x{23a}\x{23a}\x{23a})\1/8i + \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65} + 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65} + 1: \x{23a}\x{23a}\x{23a} + +/(ȺȺȺ)\1/8i + ȺȺȺⱥⱥⱥ + 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65} + 1: \x{23a}\x{23a}\x{23a} + +/(\x{2c65}\x{2c65})\1/8i + \x{2c65}\x{2c65}\x{23a}\x{23a} + 0: \x{2c65}\x{2c65}\x{23a}\x{23a} + 1: \x{2c65}\x{2c65} + +/(ⱥⱥ)\1/8i + ⱥⱥȺȺ + 0: \x{2c65}\x{2c65}\x{23a}\x{23a} + 1: \x{2c65}\x{2c65} + +/(\x{23a}\x{23a}\x{23a})\1Y/8i + X\x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}YZ + 0: \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}Y + 1: \x{23a}\x{23a}\x{23a} + +/(\x{2c65}\x{2c65})\1Y/8i + X\x{2c65}\x{2c65}\x{23a}\x{23a}YZ + 0: \x{2c65}\x{2c65}\x{23a}\x{23a}Y + 1: \x{2c65}\x{2c65} + +/-- --/ + /-- End of testinput12 --/ Index: testdata/testinput12 =================================================================== --- testdata/testinput12 (revision 594) +++ testdata/testinput12 (revision 595) @@ -503,4 +503,44 @@ /A+\p{N}A+\dB+\p{N}*B+\d*/WBZ +/-- These behaved oddly in Perl, so they are kept in this test --/ + +/(\x{23a}\x{23a}\x{23a})?\1/8i + \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65} + +/(ȺȺȺ)?\1/8i + ȺȺȺⱥⱥ + +/(\x{23a}\x{23a}\x{23a})?\1/8i + \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65} + +/(ȺȺȺ)?\1/8i + ȺȺȺⱥⱥⱥ + +/(\x{23a}\x{23a}\x{23a})\1/8i + \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65} + +/(ȺȺȺ)\1/8i + ȺȺȺⱥⱥ + +/(\x{23a}\x{23a}\x{23a})\1/8i + \x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65} + +/(ȺȺȺ)\1/8i + ȺȺȺⱥⱥⱥ + +/(\x{2c65}\x{2c65})\1/8i + \x{2c65}\x{2c65}\x{23a}\x{23a} + +/(ⱥⱥ)\1/8i + ⱥⱥȺȺ + +/(\x{23a}\x{23a}\x{23a})\1Y/8i + X\x{23a}\x{23a}\x{23a}\x{2c65}\x{2c65}\x{2c65}YZ + +/(\x{2c65}\x{2c65})\1Y/8i + X\x{2c65}\x{2c65}\x{23a}\x{23a}YZ + +/-- --/ + /-- End of testinput12 --/ Index: pcre_exec.c =================================================================== --- pcre_exec.c (revision 594) +++ pcre_exec.c (revision 595) @@ -132,24 +132,27 @@ * Match a back-reference * *************************************************/ -/* If a back reference hasn't been set, the length that is passed is greater -than the number of characters left in the string, so the match fails. +/* Normally, if a back reference hasn't been set, the length that is passed is +negative, so the match always fails. However, in JavaScript compatibility mode, +the length passed is zero. Note that in caseless UTF-8 mode, the number of +subject bytes matched may be different to the number of reference bytes. Arguments: offset index into the offset vector - eptr points into the subject - length length to be matched + eptr pointer into the subject + length length of reference to be matched (number of bytes) md points to match data block ims the ims flags -Returns: TRUE if matched +Returns: < 0 if not matched, otherwise the number of subject bytes matched */ -static BOOL +static int match_ref(int offset, register USPTR eptr, int length, match_data *md, unsigned long int ims) { -USPTR p = md->start_subject + md->offset_vector[offset]; +USPTR eptr_start = eptr; +register USPTR p = md->start_subject + md->offset_vector[offset]; #ifdef PCRE_DEBUG if (eptr >= md->end_subject) @@ -164,9 +167,9 @@ printf("\n"); #endif -/* Always fail if not enough characters left */ +/* Always fail if reference not set (and not JavaScript compatible). */ -if (length > md->end_subject - eptr) return FALSE; +if (length < 0) return -1; /* Separate the caseless case for speed. In UTF-8 mode we can only do this properly if Unicode properties are supported. Otherwise, we can check only @@ -178,13 +181,21 @@ #ifdef SUPPORT_UCP if (md->utf8) { - USPTR endptr = eptr + length; - while (eptr < endptr) + /* Match characters up to the end of the reference. NOTE: the number of + bytes matched may differ, because there are some characters whose upper and + lower case versions code as different numbers of bytes. For example, U+023A + (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8); + a sequence of 3 of the former uses 6 bytes, as does a sequence of two of + the latter. It is important, therefore, to check the length along the + reference, not along the subject (earlier code did this wrong). */ + + USPTR endptr = p + length; + while (p < endptr) { int c, d; GETCHARINC(c, eptr); GETCHARINC(d, p); - if (c != d && c != UCD_OTHERCASE(d)) return FALSE; + if (c != d && c != UCD_OTHERCASE(d)) return -1; } } else @@ -195,16 +206,16 @@ is no UCP support. */ while (length-- > 0) - { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; } + { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; } } /* In the caseful case, we can just compare the bytes, whether or not we are in UTF-8 mode. */ else - { while (length-- > 0) if (*p++ != *eptr++) return FALSE; } + { while (length-- > 0) if (*p++ != *eptr++) return -1; } -return TRUE; +return eptr - eptr_start; } @@ -2252,129 +2263,129 @@ loops). */ case OP_REF: - { - offset = GET2(ecode, 1) << 1; /* Doubled ref number */ - ecode += 3; + offset = GET2(ecode, 1) << 1; /* Doubled ref number */ + ecode += 3; - /* If the reference is unset, there are two possibilities: + /* If the reference is unset, there are two possibilities: - (a) In the default, Perl-compatible state, set the length to be longer - than the amount of subject left; this ensures that every attempt at a - match fails. We can't just fail here, because of the possibility of - quantifiers with zero minima. + (a) In the default, Perl-compatible state, set the length negative; + this ensures that every attempt at a match fails. We can't just fail + here, because of the possibility of quantifiers with zero minima. - (b) If the JavaScript compatibility flag is set, set the length to zero - so that the back reference matches an empty string. + (b) If the JavaScript compatibility flag is set, set the length to zero + so that the back reference matches an empty string. - Otherwise, set the length to the length of what was matched by the - referenced subpattern. */ + Otherwise, set the length to the length of what was matched by the + referenced subpattern. */ - if (offset >= offset_top || md->offset_vector[offset] < 0) - length = (md->jscript_compat)? 0 : (int)(md->end_subject - eptr + 1); - else - length = md->offset_vector[offset+1] - md->offset_vector[offset]; + if (offset >= offset_top || md->offset_vector[offset] < 0) + length = (md->jscript_compat)? 0 : -1; + else + length = md->offset_vector[offset+1] - md->offset_vector[offset]; - /* Set up for repetition, or handle the non-repeated case */ + /* Set up for repetition, or handle the non-repeated case */ - switch (*ecode) - { - case OP_CRSTAR: - case OP_CRMINSTAR: - case OP_CRPLUS: - case OP_CRMINPLUS: - case OP_CRQUERY: - case OP_CRMINQUERY: - c = *ecode++ - OP_CRSTAR; - minimize = (c & 1) != 0; - min = rep_min[c]; /* Pick up values from tables; */ - max = rep_max[c]; /* zero for max => infinity */ - if (max == 0) max = INT_MAX; - break; + switch (*ecode) + { + case OP_CRSTAR: + case OP_CRMINSTAR: + case OP_CRPLUS: + case OP_CRMINPLUS: + case OP_CRQUERY: + case OP_CRMINQUERY: + c = *ecode++ - OP_CRSTAR; + minimize = (c & 1) != 0; + min = rep_min[c]; /* Pick up values from tables; */ + max = rep_max[c]; /* zero for max => infinity */ + if (max == 0) max = INT_MAX; + break; - case OP_CRRANGE: - case OP_CRMINRANGE: - minimize = (*ecode == OP_CRMINRANGE); - min = GET2(ecode, 1); - max = GET2(ecode, 3); - if (max == 0) max = INT_MAX; - ecode += 5; - break; + case OP_CRRANGE: + case OP_CRMINRANGE: + minimize = (*ecode == OP_CRMINRANGE); + min = GET2(ecode, 1); + max = GET2(ecode, 3); + if (max == 0) max = INT_MAX; + ecode += 5; + break; - default: /* No repeat follows */ - if (!match_ref(offset, eptr, length, md, ims)) - { - CHECK_PARTIAL(); - MRRETURN(MATCH_NOMATCH); - } - eptr += length; - continue; /* With the main loop */ + default: /* No repeat follows */ + if ((length = match_ref(offset, eptr, length, md, ims)) < 0) + { + CHECK_PARTIAL(); + MRRETURN(MATCH_NOMATCH); } + eptr += length; + continue; /* With the main loop */ + } - /* If the length of the reference is zero, just continue with the - main loop. */ + /* Handle repeated back references. If the length of the reference is + zero, just continue with the main loop. */ - if (length == 0) continue; + if (length == 0) continue; - /* First, ensure the minimum number of matches are present. We get back - the length of the reference string explicitly rather than passing the - address of eptr, so that eptr can be a register variable. */ + /* First, ensure the minimum number of matches are present. We get back + the length of the reference string explicitly rather than passing the + address of eptr, so that eptr can be a register variable. */ - for (i = 1; i <= min; i++) + for (i = 1; i <= min; i++) + { + int slength; + if ((slength = match_ref(offset, eptr, length, md, ims)) < 0) { - if (!match_ref(offset, eptr, length, md, ims)) - { - CHECK_PARTIAL(); - MRRETURN(MATCH_NOMATCH); - } - eptr += length; + CHECK_PARTIAL(); + MRRETURN(MATCH_NOMATCH); } + eptr += slength; + } - /* If min = max, continue at the same level without recursion. - They are not both allowed to be zero. */ + /* If min = max, continue at the same level without recursion. + They are not both allowed to be zero. */ - if (min == max) continue; + if (min == max) continue; - /* If minimizing, keep trying and advancing the pointer */ + /* If minimizing, keep trying and advancing the pointer */ - if (minimize) + if (minimize) + { + for (fi = min;; fi++) { - for (fi = min;; fi++) + int slength; + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max) MRRETURN(MATCH_NOMATCH); + if ((slength = match_ref(offset, eptr, length, md, ims)) < 0) { - RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max) MRRETURN(MATCH_NOMATCH); - if (!match_ref(offset, eptr, length, md, ims)) - { - CHECK_PARTIAL(); - MRRETURN(MATCH_NOMATCH); - } - eptr += length; + CHECK_PARTIAL(); + MRRETURN(MATCH_NOMATCH); } - /* Control never gets here */ + eptr += slength; } + /* Control never gets here */ + } - /* If maximizing, find the longest string and work backwards */ + /* If maximizing, find the longest string and work backwards */ - else + else + { + pp = eptr; + for (i = min; i < max; i++) { - pp = eptr; - for (i = min; i < max; i++) + int slength; + if ((slength = match_ref(offset, eptr, length, md, ims)) < 0) { - if (!match_ref(offset, eptr, length, md, ims)) - { - CHECK_PARTIAL(); - break; - } - eptr += length; + CHECK_PARTIAL(); + break; } - while (eptr >= pp) - { - RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - eptr -= length; - } - MRRETURN(MATCH_NOMATCH); + eptr += slength; } + while (eptr >= pp) + { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + eptr -= length; + } + MRRETURN(MATCH_NOMATCH); } /* Control never gets here */ Index: testdata/testinput1 =================================================================== --- testdata/testinput1 (revision 596) +++ testdata/testinput1 (revision 597) @@ -4079,4 +4079,10 @@ /^\c/ ? +/(abc)\1/i + abc + +/(abc)\1/ + abc + /-- End of testinput1 --/ Index: testdata/testoutput1 =================================================================== --- testdata/testoutput1 (revision 596) +++ testdata/testoutput1 (revision 597) @@ -6666,4 +6666,12 @@ ? 0: ? +/(abc)\1/i + abc +No match + +/(abc)\1/ + abc +No match + /-- End of testinput1 --/ Index: testdata/testinput4 =================================================================== --- testdata/testinput4 (revision 596) +++ testdata/testinput4 (revision 597) @@ -644,4 +644,10 @@ /A*/g8 AAB\x{123}BAA +/(abc)\1/8i + abc + +/(abc)\1/8 + abc + /-- End of testinput4 --/ Index: testdata/testoutput4 =================================================================== --- testdata/testoutput4 (revision 596) +++ testdata/testoutput4 (revision 597) @@ -1128,4 +1128,12 @@ 0: AA 0: +/(abc)\1/8i + abc +No match + +/(abc)\1/8 + abc +No match + /-- End of testinput4 --/ Index: pcre_exec.c =================================================================== --- pcre_exec.c (revision 596) +++ pcre_exec.c (revision 597) @@ -193,6 +193,7 @@ while (p < endptr) { int c, d; + if (eptr >= md->end_subject) return -1; GETCHARINC(c, eptr); GETCHARINC(d, p); if (c != d && c != UCD_OTHERCASE(d)) return -1; @@ -204,16 +205,21 @@ /* The same code works when not in UTF-8 mode and in UTF-8 mode when there is no UCP support. */ - - while (length-- > 0) - { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; } + { + if (eptr + length > md->end_subject) return -1; + while (length-- > 0) + { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; } + } } /* In the caseful case, we can just compare the bytes, whether or not we are in UTF-8 mode. */ else - { while (length-- > 0) if (*p++ != *eptr++) return -1; } + { + if (eptr + length > md->end_subject) return -1; + while (length-- > 0) if (*p++ != *eptr++) return -1; + } return eptr - eptr_start; }