Path: usenet.cise.ufl.edu!usenet.eel.ufl.edu!www.nntp.primenet.com!nntp.primenet.com!feed1.news.erols.com!howland.erols.net!newsfeed.internetmci.com!in2.uu.net!192.108.254.3!news.teleport.com!not-for-mail From: chip@rio.atlantic.net (Chip Salzenberg) Newsgroups: comp.lang.perl.announce,comp.lang.perl.misc Subject: Patch to Perl 5.004 for case-insensitive patterns (REFCASE1) Followup-To: comp.lang.perl.misc Date: 20 May 1997 17:55:02 GMT Organization: Internet Connect Company +1(352)375-2912, USA Lines: 181 Sender: news-merlyn@gadget.cscaper.com Approved: merlyn@stonehenge.com (comp.lang.perl.announce) Message-ID: <5lsohm$m4t$1@nadine.teleport.com> NNTP-Posting-Host: gadget.cscaper.com X-Disclaimer: The "Approved" header verifies header information for article transmission and does not imply approval of content. Xref: usenet.cise.ufl.edu comp.lang.perl.announce:178 comp.lang.perl.misc:27731 In a case-insensitive pattern match (//i), back-references to previous parenthesized subpatterns (e.g. the \1 in "/(\w+)\s+\1/i") should be case-insensitive as well. In Perl 5.004, they aren't. This patch fixes that problem. Index: patchlevel.h *************** *** 41,42 **** --- 41,43 ---- + ,"REFCASE1 - fix for case-insensitivity in regex backreferences" ,NULL }; Index: regcomp.h *************** *** 73,95 **** #define NBOUND 22 /* no Match "" at any word non-boundary */ #define NBOUNDL 23 /* no Match "" at any word non-boundary */ ! #define REF 24 /* num Match some already matched string */ ! #define OPEN 25 /* num Mark this point in input as start of #n. */ ! #define CLOSE 26 /* num Analogous to OPEN. */ ! #define MINMOD 27 /* no Next operator is not greedy. */ ! #define GPOS 28 /* no Matches where last m//g left off. */ ! #define IFMATCH 29 /* no Succeeds if the following matches. */ ! #define UNLESSM 30 /* no Fails if the following matches. */ ! #define SUCCEED 31 /* no Return from a subroutine, basically. */ ! #define WHILEM 32 /* no Do curly processing and see if rest matches. */ ! #define ALNUM 33 /* no Match any alphanumeric character */ ! #define ALNUML 34 /* no Match any alphanumeric char in locale */ ! #define NALNUM 35 /* no Match any non-alphanumeric character */ ! #define NALNUML 36 /* no Match any non-alphanumeric char in locale */ ! #define SPACE 37 /* no Match any whitespace character */ ! #define SPACEL 38 /* no Match any whitespace char in locale */ ! #define NSPACE 39 /* no Match any non-whitespace character */ ! #define NSPACEL 40 /* no Match any non-whitespace char in locale */ ! #define DIGIT 41 /* no Match any numeric character */ ! #define NDIGIT 42 /* no Match any non-numeric character */ /* --- 73,97 ---- #define NBOUND 22 /* no Match "" at any word non-boundary */ #define NBOUNDL 23 /* no Match "" at any word non-boundary */ ! #define REF 24 /* num Match already matched string */ ! #define REFF 25 /* num Match already matched string, folded */ ! #define REFFL 26 /* num Match already matched string, folded in loc. */ ! #define OPEN 27 /* num Mark this point in input as start of #n. */ ! #define CLOSE 28 /* num Analogous to OPEN. */ ! #define MINMOD 29 /* no Next operator is not greedy. */ ! #define GPOS 30 /* no Matches where last m//g left off. */ ! #define IFMATCH 31 /* no Succeeds if the following matches. */ ! #define UNLESSM 32 /* no Fails if the following matches. */ ! #define SUCCEED 33 /* no Return from a subroutine, basically. */ ! #define WHILEM 34 /* no Do curly processing and see if rest matches. */ ! #define ALNUM 35 /* no Match any alphanumeric character */ ! #define ALNUML 36 /* no Match any alphanumeric char in locale */ ! #define NALNUM 37 /* no Match any non-alphanumeric character */ ! #define NALNUML 38 /* no Match any non-alphanumeric char in locale */ ! #define SPACE 39 /* no Match any whitespace character */ ! #define SPACEL 40 /* no Match any whitespace char in locale */ ! #define NSPACE 41 /* no Match any non-whitespace character */ ! #define NSPACEL 42 /* no Match any non-whitespace char in locale */ ! #define DIGIT 43 /* no Match any numeric character */ ! #define NDIGIT 44 /* no Match any non-numeric character */ /* *************** *** 122,126 **** /*CURLY*/ 4, /*CURLYX*/ 4, 0,0,0,0,0,0,0,0,0,0,0,0, ! /*REF*/ 2, /*OPEN*/ 2, /*CLOSE*/ 2, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }; --- 124,128 ---- /*CURLY*/ 4, /*CURLYX*/ 4, 0,0,0,0,0,0,0,0,0,0,0,0, ! /*REF*/ 2, 2, 2, /*OPEN*/ 2, /*CLOSE*/ 2, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }; *************** *** 156,159 **** --- 158,163 ---- NBOUND, REF, + REF, + REF, OPEN, CLOSE, *************** *** 182,186 **** #else EXT char varies[] = { ! BRANCH, BACK, STAR, PLUS, CURLY, CURLYX, REF, WHILEM, 0 }; #endif --- 186,190 ---- #else EXT char varies[] = { ! BRANCH, BACK, STAR, PLUS, CURLY, CURLYX, REF, REFF, REFFL, WHILEM, 0 }; #endif Index: regcomp.c *************** tryagain: *** 904,908 **** else { regsawback = 1; ! ret = reganode(REF, num); *flagp |= HASWIDTH; while (isDIGIT(*regparse)) --- 904,910 ---- else { regsawback = 1; ! ret = reganode((regflags & PMf_FOLD) ! ? ((regflags & PMf_LOCALE) ? REFFL : REFF) ! : REF, num); *flagp |= HASWIDTH; while (isDIGIT(*regparse)) *************** char *op; *** 1667,1670 **** --- 1669,1678 ---- case REF: sv_catpvf(sv, "REF%d", ARG1(op)); + break; + case REFF: + sv_catpvf(sv, "REFF%d", ARG1(op)); + break; + case REFFL: + sv_catpvf(sv, "REFFL%d", ARG1(op)); break; case OPEN: Index: regexec.c *************** char *prog; *** 826,830 **** --- 826,834 ---- nextchar = UCHARAT(++locinput); break; + case REFFL: + regtainted = TRUE; + /* FALL THROUGH */ case REF: + case REFF: n = ARG1(scan); /* which paren pair */ s = regstartp[n]; *************** char *prog; *** 836,845 **** break; /* Inline the first character, for speed. */ ! if (UCHARAT(s) != nextchar) sayNO; ln = regendp[n] - s; if (locinput + ln > regeol) sayNO; ! if (ln > 1 && memNE(s, locinput, ln)) sayNO; locinput += ln; --- 840,856 ---- break; /* Inline the first character, for speed. */ ! if (UCHARAT(s) != nextchar && ! (OP(scan) == REF || ! (UCHARAT(s) != ((OP(scan) == REFF ! ? fold : fold_locale)[nextchar])))) sayNO; ln = regendp[n] - s; if (locinput + ln > regeol) sayNO; ! if (ln > 1 && (OP(scan) == REF ! ? memNE(s, locinput, ln) ! : (OP(scan) == REFF ! ? ibcmp(s, locinput, ln) ! : ibcmp_locale(s, locinput, ln)))) sayNO; locinput += ln; -- Chip Salzenberg - a.k.a. - "Most organizations reward individuals and groups that choose to re-invent the wheel." -- Bjarne Stroustrup .