| From ba6e2c38aafc23cf114f3ba0d0ff3baead34328b Mon Sep 17 00:00:00 2001 |
| From: Yves Orton <demerphq@gmail.com> |
| Date: Tue, 1 Aug 2023 23:12:46 +0200 |
| Subject: [PATCH] regcomp*.c, regexec.c - fixup regex engine build under |
| -Uusedl |
| |
| The regex engine is built a bit different from most of the perl |
| codebase. It is compiled as part of the main libperl.so and it is |
| also compiled (with DEBUGGING enabled) as part of the re extension. |
| When perl itself is compiled with DEBUGGING enabled then the code |
| in the re.so extension and the code in libperl.so is the same. |
| |
| This all works fine and dandy until you have a static build where the |
| re.so is linked into libperl.so, which results in duplicate symbols |
| being defined. These symbols come in two flaviours: "auxiliary" and |
| "debugging" related symbols. |
| |
| We have basically three cases: |
| |
| 1. USE_DYNAMIC_LOADING is defined. In this case we are doing a dynamic |
| build and re.so will be separate from libperl.so, so it even if this |
| is a DEBUGGING enabled build debug and auxiliary functions can be |
| compiled into *both* re.so and libperl.so. This is basically the |
| "standard build". |
| |
| 2. USE_DYNAMIC_LOADING is not defined, and DEBUGGING is not defined |
| either. In this case auxiliary functions should only be compiled in |
| libperl.so, and the debug functions should only be compiled into |
| re.so |
| |
| 3. USE_DYNAMIC_LOADING is not defined, and DEBUGGING *is* defined. In |
| this case auxiliary functions AND debug functions should only be |
| compiled into libperl.so |
| |
| It is possible to detect the different build options by looking at the |
| defines 'USE_DYNAMIC_LOADING', 'PERL_EXT_RE_DEBUG' and |
| 'DEBUGGING_RE_ONLY'. 'USE_DYNAMIC_LOADING' is NOT defined when we are |
| building a static perl. 'PERL_EXT_RE_DEBUG' is defined only when we are |
| building re.so, and 'DEBUGGING_RE_ONLY' is defined only when we are |
| building re.so in a perl that is not itself already a DEBUGGING enabled |
| perl. The file ext/re/re_top.h responsible for setting up |
| DEBUGGING_RE_ONLY. |
| |
| This patch uses 'PERL_EXT_RE_DEBUG', 'DEBUGGING_RE_ONLY' and |
| 'USE_DYNAMIC_LOADING' to define in regcomp.h two further define flags |
| 'PERL_RE_BUILD_DEBUG' and 'PERL_RE_BUILD_AUX'. |
| |
| The 'PERL_RE_BUILD_DEBUG' flag determines if the debugging functions |
| should be compiled into libperl.so or re.so or both. The |
| 'PERL_RE_BUILD_AUX' flag determines if the auxiliary functions should be |
| compiled into just libperl.so or into it and re.so. We then use these |
| flags to guard the different types of functions so that we can build in |
| all three modes without duplicate symbols. |
| |
| Upstream: https://github.com/Perl/perl5/commit/ba6e2c38aafc23cf114f3ba0d0ff3baead34328b |
| Signed-off-by: Fabrice Fontaine <fontaine.fabrice@gmail.com> |
| --- |
| regcomp.c | 13 +- |
| regcomp.h | 14 ++- |
| regcomp_debug.c | 308 +++++++++++++++++++++++----------------------- |
| regcomp_invlist.c | 3 +- |
| regexec.c | 3 +- |
| 5 files changed, 181 insertions(+), 160 deletions(-) |
| |
| diff --git a/regcomp.c b/regcomp.c |
| index d3c135fbfad1..6e35d95d2ac6 100644 |
| --- a/regcomp.c |
| +++ b/regcomp.c |
| @@ -290,6 +290,7 @@ S_edit_distance(const UV* src, |
| /* END of edit_distance() stuff |
| * ========================================================= */ |
| |
| +#ifdef PERL_RE_BUILD_AUX |
| /* add a data member to the struct reg_data attached to this regex, it should |
| * always return a non-zero return. the 's' argument is the type of the items |
| * being added and the n is the number of items. The length of 's' should match |
| @@ -340,6 +341,7 @@ Perl_reg_add_data(RExC_state_t* const pRExC_state, const char* const s, const U3 |
| assert(count>0); |
| return count; |
| } |
| +#endif /* PERL_RE_BUILD_AUX */ |
| |
| /*XXX: todo make this not included in a non debugging perl, but appears to be |
| * used anyway there, in 'use re' */ |
| @@ -7443,6 +7445,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) |
| } |
| |
| |
| +#ifdef PERL_RE_BUILD_AUX |
| void |
| Perl_populate_anyof_bitmap_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) |
| { |
| @@ -7502,6 +7505,7 @@ Perl_populate_anyof_bitmap_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) |
| } |
| } |
| } |
| +#endif /* PERL_RE_BUILD_AUX */ |
| |
| /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]]. |
| Character classes ([:foo:]) can also be negated ([:^foo:]). |
| @@ -9095,6 +9099,7 @@ S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state, |
| #undef IS_OPERATOR |
| #undef IS_OPERAND |
| |
| +#ifdef PERL_RE_BUILD_AUX |
| void |
| Perl_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist) |
| { |
| @@ -9182,6 +9187,8 @@ Perl_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** i |
| } |
| } |
| } |
| +#endif /* PERL_RE_BUILD_AUX */ |
| + |
| |
| STATIC void |
| S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings) |
| @@ -12105,6 +12112,7 @@ S_optimize_regclass(pTHX_ |
| |
| #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION |
| |
| +#ifdef PERL_RE_BUILD_AUX |
| void |
| Perl_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, |
| regnode* const node, |
| @@ -12261,6 +12269,7 @@ Perl_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, |
| RExC_rxi->data->data[n] = (void*)rv; |
| ARG1u_SET(node, n); |
| } |
| +#endif /* PERL_RE_BUILD_AUX */ |
| |
| SV * |
| |
| @@ -12999,6 +13008,8 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p, |
| } |
| #endif |
| |
| + |
| +#ifdef PERL_RE_BUILD_AUX |
| SV* |
| Perl_get_ANYOFM_contents(pTHX_ const regnode * n) { |
| |
| @@ -13047,7 +13058,7 @@ Perl_get_ANYOFHbbm_contents(pTHX_ const regnode * n) { |
| UTF_CONTINUATION_MARK | 0)); |
| return cp_list; |
| } |
| - |
| +#endif /* PERL_RE_BUILD_AUX */ |
| |
| |
| SV * |
| diff --git a/regcomp.h b/regcomp.h |
| index 31c91e6a68e8..017a9f843514 100644 |
| --- a/regcomp.h |
| +++ b/regcomp.h |
| @@ -1554,7 +1554,19 @@ typedef enum { |
| #define EVAL_OPTIMISTIC_FLAG 128 |
| #define EVAL_FLAGS_MASK (EVAL_OPTIMISTIC_FLAG-1) |
| |
| - |
| +/* We define PERL_RE_BUILD_DEBUG if we are NOT compiling the re extension and |
| + * we are under DEBUGGING, or if we are ARE compiling the re extension |
| + * and this is not a DEBUGGING enabled build (identified by |
| + * DEBUGGING_RE_ONLY being defined) |
| + */ |
| +#if ( defined(USE_DYNAMIC_LOADING) && defined(DEBUGGING)) || \ |
| + ( defined(PERL_EXT_RE_BUILD) && defined(DEBUGGING_RE_ONLY)) || \ |
| + (!defined(PERL_EXT_RE_BUILD) && defined(DEBUGGING)) |
| +#define PERL_RE_BUILD_DEBUG |
| +#endif |
| +#if ( defined(USE_DYNAMIC_LOADING) || !defined(PERL_EXT_RE_BUILD) ) |
| +#define PERL_RE_BUILD_AUX |
| +#endif |
| |
| #endif /* PERL_REGCOMP_H_ */ |
| |
| diff --git a/regcomp_debug.c b/regcomp_debug.c |
| index 93db7a89cf48..96598c49c0bc 100644 |
| --- a/regcomp_debug.c |
| +++ b/regcomp_debug.c |
| @@ -18,8 +18,7 @@ |
| #include "unicode_constants.h" |
| #include "regcomp_internal.h" |
| |
| -#ifdef DEBUGGING |
| - |
| +#ifdef PERL_RE_BUILD_DEBUG |
| int |
| Perl_re_printf(pTHX_ const char *fmt, ...) |
| { |
| @@ -159,13 +158,160 @@ Perl_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state, |
| }); |
| } |
| |
| -#endif /* DEBUGGING */ |
| +const regnode * |
| +Perl_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, |
| + const regnode *last, const regnode *plast, |
| + SV* sv, I32 indent, U32 depth) |
| +{ |
| + const regnode *next; |
| + const regnode *optstart= NULL; |
| + |
| + RXi_GET_DECL(r, ri); |
| + DECLARE_AND_GET_RE_DEBUG_FLAGS; |
| + |
| + PERL_ARGS_ASSERT_DUMPUNTIL; |
| + |
| +#ifdef DEBUG_DUMPUNTIL |
| + Perl_re_printf( aTHX_ "--- %d : %d - %d - %d\n", indent, node-start, |
| + last ? last-start : 0, plast ? plast-start : 0); |
| +#endif |
| + |
| + if (plast && plast < last) |
| + last= plast; |
| + |
| + while (node && (!last || node < last)) { |
| + const U8 op = OP(node); |
| + |
| + if (op == CLOSE || op == SRCLOSE || op == WHILEM) |
| + indent--; |
| + next = regnext((regnode *)node); |
| + const regnode *after = regnode_after((regnode *)node,0); |
| + |
| + /* Where, what. */ |
| + if (op == OPTIMIZED) { |
| + if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE)) |
| + optstart = node; |
| + else |
| + goto after_print; |
| + } else |
| + CLEAR_OPTSTART; |
| + |
| + regprop(r, sv, node, NULL, NULL); |
| + Perl_re_printf( aTHX_ "%4" IVdf ":%*s%s", (IV)(node - start), |
| + (int)(2*indent + 1), "", SvPVX_const(sv)); |
| + |
| + if (op != OPTIMIZED) { |
| + if (next == NULL) /* Next ptr. */ |
| + Perl_re_printf( aTHX_ " (0)"); |
| + else if (REGNODE_TYPE(op) == BRANCH |
| + && REGNODE_TYPE(OP(next)) != BRANCH ) |
| + Perl_re_printf( aTHX_ " (FAIL)"); |
| + else |
| + Perl_re_printf( aTHX_ " (%" IVdf ")", (IV)(next - start)); |
| + Perl_re_printf( aTHX_ "\n"); |
| + } |
| + |
| + after_print: |
| + if (REGNODE_TYPE(op) == BRANCHJ) { |
| + assert(next); |
| + const regnode *nnode = (OP(next) == LONGJMP |
| + ? regnext((regnode *)next) |
| + : next); |
| + if (last && nnode > last) |
| + nnode = last; |
| + DUMPUNTIL(after, nnode); |
| + } |
| + else if (REGNODE_TYPE(op) == BRANCH) { |
| + assert(next); |
| + DUMPUNTIL(after, next); |
| + } |
| + else if ( REGNODE_TYPE(op) == TRIE ) { |
| + const regnode *this_trie = node; |
| + const U32 n = ARG1u(node); |
| + const reg_ac_data * const ac = op>=AHOCORASICK ? |
| + (reg_ac_data *)ri->data->data[n] : |
| + NULL; |
| + const reg_trie_data * const trie = |
| + (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie]; |
| +#ifdef DEBUGGING |
| + AV *const trie_words |
| + = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]); |
| +#endif |
| + const regnode *nextbranch= NULL; |
| + I32 word_idx; |
| + SvPVCLEAR(sv); |
| + for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) { |
| + SV ** const elem_ptr = av_fetch_simple(trie_words, word_idx, 0); |
| + |
| + Perl_re_indentf( aTHX_ "%s ", |
| + indent+3, |
| + elem_ptr |
| + ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), |
| + SvCUR(*elem_ptr), PL_dump_re_max_len, |
| + PL_colors[0], PL_colors[1], |
| + (SvUTF8(*elem_ptr) |
| + ? PERL_PV_ESCAPE_UNI |
| + : 0) |
| + | PERL_PV_PRETTY_ELLIPSES |
| + | PERL_PV_PRETTY_LTGT |
| + ) |
| + : "???" |
| + ); |
| + if (trie->jump) { |
| + U16 dist= trie->jump[word_idx+1]; |
| + Perl_re_printf( aTHX_ "(%" UVuf ")\n", |
| + (UV)((dist ? this_trie + dist : next) - start)); |
| + if (dist) { |
| + if (!nextbranch) |
| + nextbranch= this_trie + trie->jump[0]; |
| + DUMPUNTIL(this_trie + dist, nextbranch); |
| + } |
| + if (nextbranch && REGNODE_TYPE(OP(nextbranch))==BRANCH) |
| + nextbranch= regnext((regnode *)nextbranch); |
| + } else { |
| + Perl_re_printf( aTHX_ "\n"); |
| + } |
| + } |
| + if (last && next > last) |
| + node= last; |
| + else |
| + node= next; |
| + } |
| + else if ( op == CURLY ) { /* "next" might be very big: optimizer */ |
| + DUMPUNTIL(after, after + 1); /* +1 is NOT a REGNODE_AFTER */ |
| + } |
| + else if (REGNODE_TYPE(op) == CURLY && op != CURLYX) { |
| + assert(next); |
| + DUMPUNTIL(after, next); |
| + } |
| + else if ( op == PLUS || op == STAR) { |
| + DUMPUNTIL(after, after + 1); /* +1 NOT a REGNODE_AFTER */ |
| + } |
| + else if (REGNODE_TYPE(op) == EXACT || op == ANYOFHs) { |
| + /* Literal string, where present. */ |
| + node = (const regnode *)REGNODE_AFTER_varies(node); |
| + } |
| + else { |
| + node = REGNODE_AFTER_opcode(node,op); |
| + } |
| + if (op == CURLYX || op == OPEN || op == SROPEN) |
| + indent++; |
| + if (REGNODE_TYPE(op) == END) |
| + break; |
| + } |
| + CLEAR_OPTSTART; |
| +#ifdef DEBUG_DUMPUNTIL |
| + Perl_re_printf( aTHX_ "--- %d\n", (int)indent); |
| +#endif |
| + return node; |
| +} |
| + |
| +#endif /* PERL_RE_BUILD_DEBUG */ |
| |
| /* |
| - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form |
| */ |
| #ifdef DEBUGGING |
| - |
| static void |
| S_regdump_intflags(pTHX_ const char *lead, const U32 flags) |
| { |
| @@ -907,8 +1053,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ |
| #endif /* DEBUGGING */ |
| } |
| |
| -#ifdef DEBUGGING |
| |
| +#ifdef DEBUGGING |
| STATIC void |
| S_put_code_point(pTHX_ SV *sv, UV c) |
| { |
| @@ -1517,154 +1663,4 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv, |
| |
| return did_output_something; |
| } |
| - |
| - |
| -const regnode * |
| -Perl_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, |
| - const regnode *last, const regnode *plast, |
| - SV* sv, I32 indent, U32 depth) |
| -{ |
| - const regnode *next; |
| - const regnode *optstart= NULL; |
| - |
| - RXi_GET_DECL(r, ri); |
| - DECLARE_AND_GET_RE_DEBUG_FLAGS; |
| - |
| - PERL_ARGS_ASSERT_DUMPUNTIL; |
| - |
| -#ifdef DEBUG_DUMPUNTIL |
| - Perl_re_printf( aTHX_ "--- %d : %d - %d - %d\n", indent, node-start, |
| - last ? last-start : 0, plast ? plast-start : 0); |
| -#endif |
| - |
| - if (plast && plast < last) |
| - last= plast; |
| - |
| - while (node && (!last || node < last)) { |
| - const U8 op = OP(node); |
| - |
| - if (op == CLOSE || op == SRCLOSE || op == WHILEM) |
| - indent--; |
| - next = regnext((regnode *)node); |
| - const regnode *after = regnode_after((regnode *)node,0); |
| - |
| - /* Where, what. */ |
| - if (op == OPTIMIZED) { |
| - if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE)) |
| - optstart = node; |
| - else |
| - goto after_print; |
| - } else |
| - CLEAR_OPTSTART; |
| - |
| - regprop(r, sv, node, NULL, NULL); |
| - Perl_re_printf( aTHX_ "%4" IVdf ":%*s%s", (IV)(node - start), |
| - (int)(2*indent + 1), "", SvPVX_const(sv)); |
| - |
| - if (op != OPTIMIZED) { |
| - if (next == NULL) /* Next ptr. */ |
| - Perl_re_printf( aTHX_ " (0)"); |
| - else if (REGNODE_TYPE(op) == BRANCH |
| - && REGNODE_TYPE(OP(next)) != BRANCH ) |
| - Perl_re_printf( aTHX_ " (FAIL)"); |
| - else |
| - Perl_re_printf( aTHX_ " (%" IVdf ")", (IV)(next - start)); |
| - Perl_re_printf( aTHX_ "\n"); |
| - } |
| - |
| - after_print: |
| - if (REGNODE_TYPE(op) == BRANCHJ) { |
| - assert(next); |
| - const regnode *nnode = (OP(next) == LONGJMP |
| - ? regnext((regnode *)next) |
| - : next); |
| - if (last && nnode > last) |
| - nnode = last; |
| - DUMPUNTIL(after, nnode); |
| - } |
| - else if (REGNODE_TYPE(op) == BRANCH) { |
| - assert(next); |
| - DUMPUNTIL(after, next); |
| - } |
| - else if ( REGNODE_TYPE(op) == TRIE ) { |
| - const regnode *this_trie = node; |
| - const U32 n = ARG1u(node); |
| - const reg_ac_data * const ac = op>=AHOCORASICK ? |
| - (reg_ac_data *)ri->data->data[n] : |
| - NULL; |
| - const reg_trie_data * const trie = |
| - (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie]; |
| -#ifdef DEBUGGING |
| - AV *const trie_words |
| - = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]); |
| -#endif |
| - const regnode *nextbranch= NULL; |
| - I32 word_idx; |
| - SvPVCLEAR(sv); |
| - for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) { |
| - SV ** const elem_ptr = av_fetch_simple(trie_words, word_idx, 0); |
| - |
| - Perl_re_indentf( aTHX_ "%s ", |
| - indent+3, |
| - elem_ptr |
| - ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), |
| - SvCUR(*elem_ptr), PL_dump_re_max_len, |
| - PL_colors[0], PL_colors[1], |
| - (SvUTF8(*elem_ptr) |
| - ? PERL_PV_ESCAPE_UNI |
| - : 0) |
| - | PERL_PV_PRETTY_ELLIPSES |
| - | PERL_PV_PRETTY_LTGT |
| - ) |
| - : "???" |
| - ); |
| - if (trie->jump) { |
| - U16 dist= trie->jump[word_idx+1]; |
| - Perl_re_printf( aTHX_ "(%" UVuf ")\n", |
| - (UV)((dist ? this_trie + dist : next) - start)); |
| - if (dist) { |
| - if (!nextbranch) |
| - nextbranch= this_trie + trie->jump[0]; |
| - DUMPUNTIL(this_trie + dist, nextbranch); |
| - } |
| - if (nextbranch && REGNODE_TYPE(OP(nextbranch))==BRANCH) |
| - nextbranch= regnext((regnode *)nextbranch); |
| - } else { |
| - Perl_re_printf( aTHX_ "\n"); |
| - } |
| - } |
| - if (last && next > last) |
| - node= last; |
| - else |
| - node= next; |
| - } |
| - else if ( op == CURLY ) { /* "next" might be very big: optimizer */ |
| - DUMPUNTIL(after, after + 1); /* +1 is NOT a REGNODE_AFTER */ |
| - } |
| - else if (REGNODE_TYPE(op) == CURLY && op != CURLYX) { |
| - assert(next); |
| - DUMPUNTIL(after, next); |
| - } |
| - else if ( op == PLUS || op == STAR) { |
| - DUMPUNTIL(after, after + 1); /* +1 NOT a REGNODE_AFTER */ |
| - } |
| - else if (REGNODE_TYPE(op) == EXACT || op == ANYOFHs) { |
| - /* Literal string, where present. */ |
| - node = (const regnode *)REGNODE_AFTER_varies(node); |
| - } |
| - else { |
| - node = REGNODE_AFTER_opcode(node,op); |
| - } |
| - if (op == CURLYX || op == OPEN || op == SROPEN) |
| - indent++; |
| - if (REGNODE_TYPE(op) == END) |
| - break; |
| - } |
| - CLEAR_OPTSTART; |
| -#ifdef DEBUG_DUMPUNTIL |
| - Perl_re_printf( aTHX_ "--- %d\n", (int)indent); |
| -#endif |
| - return node; |
| -} |
| - |
| -#endif /* DEBUGGING */ |
| +#endif /* DEBUGGING */ |
| diff --git a/regcomp_invlist.c b/regcomp_invlist.c |
| index 9ea3f431817d..82f82305846a 100644 |
| --- a/regcomp_invlist.c |
| +++ b/regcomp_invlist.c |
| @@ -18,7 +18,7 @@ |
| #include "unicode_constants.h" |
| #include "regcomp_internal.h" |
| |
| - |
| +#ifdef PERL_RE_BUILD_AUX |
| void |
| Perl_populate_bitmap_from_invlist(pTHX_ SV * invlist, const UV offset, const U8 * bitmap, const Size_t len) |
| { |
| @@ -70,6 +70,7 @@ Perl_populate_invlist_from_bitmap(pTHX_ const U8 * bitmap, const Size_t bitmap_l |
| } |
| } |
| } |
| +#endif /* PERL_RE_BUILD_AUX */ |
| |
| /* This section of code defines the inversion list object and its methods. The |
| * interfaces are highly subject to change, so as much as possible is static to |
| diff --git a/regexec.c b/regexec.c |
| index c404d9aa3d73..de0b7c461918 100644 |
| --- a/regexec.c |
| +++ b/regexec.c |
| @@ -4428,7 +4428,8 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startposp) |
| */ |
| #define REPORT_CODE_OFF 29 |
| #define INDENT_CHARS(depth) ((int)(depth) % 20) |
| -#ifdef DEBUGGING |
| + |
| +#ifdef PERL_RE_BUILD_DEBUG |
| int |
| Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...) |
| { |