To: vim_dev@googlegroups.com Subject: Patch 7.3.1150 Fcc: outbox From: Bram Moolenaar Mime-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ------------ Patch 7.3.1150 Problem: New regexpengine: Slow when a look-behind match does not have a width specified. Solution: Try to compute the maximum width. Files: src/regexp_nfa.c *** ../vim-7.3.1149/src/regexp_nfa.c 2013-06-08 18:19:39.000000000 +0200 --- src/regexp_nfa.c 2013-06-08 22:29:25.000000000 +0200 *************** *** 38,56 **** NFA_START_COLL, /* [abc] start */ NFA_END_COLL, /* [abc] end */ NFA_START_NEG_COLL, /* [^abc] start */ ! NFA_END_NEG_COLL, /* [^abc] end (only used in postfix) */ ! NFA_RANGE, /* range of the two previous items (only ! * used in postfix) */ NFA_RANGE_MIN, /* low end of a range */ NFA_RANGE_MAX, /* high end of a range */ ! NFA_CONCAT, /* concatenate two previous items (only ! * used in postfix) */ ! NFA_OR, ! NFA_STAR, /* greedy * */ ! NFA_STAR_NONGREEDY, /* non-greedy * */ ! NFA_QUEST, /* greedy \? */ ! NFA_QUEST_NONGREEDY, /* non-greedy \? */ NFA_BOL, /* ^ Begin line */ NFA_EOL, /* $ End line */ --- 38,56 ---- NFA_START_COLL, /* [abc] start */ NFA_END_COLL, /* [abc] end */ NFA_START_NEG_COLL, /* [^abc] start */ ! NFA_END_NEG_COLL, /* [^abc] end (postfix only) */ ! NFA_RANGE, /* range of the two previous items ! * (postfix only) */ NFA_RANGE_MIN, /* low end of a range */ NFA_RANGE_MAX, /* high end of a range */ ! NFA_CONCAT, /* concatenate two previous items (postfix ! * only) */ ! NFA_OR, /* \| (postfix only) */ ! NFA_STAR, /* greedy * (posfix only) */ ! NFA_STAR_NONGREEDY, /* non-greedy * (postfix only) */ ! NFA_QUEST, /* greedy \? (postfix only) */ ! NFA_QUEST_NONGREEDY, /* non-greedy \? (postfix only) */ NFA_BOL, /* ^ Begin line */ NFA_EOL, /* $ End line */ *************** *** 153,160 **** /* NFA_FIRST_NL */ NFA_ANY, /* Match any one character. */ - NFA_ANYOF, /* Match any character in this string. */ - NFA_ANYBUT, /* Match any character not in this string. */ NFA_IDENT, /* Match identifier char */ NFA_SIDENT, /* Match identifier char but no digit */ NFA_KWORD, /* Match keyword char */ --- 153,158 ---- *************** *** 496,503 **** /* * Figure out if the NFA state list contains just literal text and nothing ! * else. If so return a string with what must match after regstart. ! * Otherwise return NULL. */ static char_u * nfa_get_match_text(start) --- 494,501 ---- /* * Figure out if the NFA state list contains just literal text and nothing ! * else. If so return a string in allocated memory with what must match after ! * regstart. Otherwise return NULL. */ static char_u * nfa_get_match_text(start) *************** *** 2578,2583 **** --- 2576,2800 ---- } /* + * Estimate the maximum byte length of anything matching "state". + * When unknown or unlimited return -1. + */ + static int + nfa_max_width(startstate, depth) + nfa_state_T *startstate; + int depth; + { + int l, r; + nfa_state_T *state = startstate; + int len = 0; + + /* detect looping in a NFA_SPLIT */ + if (depth > 4) + return -1; + + for (;;) + { + switch (state->c) + { + case NFA_END_INVISIBLE: + case NFA_END_INVISIBLE_NEG: + /* the end, return what we have */ + return len; + + case NFA_SPLIT: + /* two alternatives, use the maximum */ + l = nfa_max_width(state->out, depth + 1); + r = nfa_max_width(state->out1, depth + 1); + if (l < 0 || r < 0) + return -1; + return len + (l > r ? l : r); + + case NFA_ANY: + case NFA_START_COLL: + case NFA_START_NEG_COLL: + /* matches some character, including composing chars */ + #ifdef FEAT_MBYTE + if (enc_utf8) + len += MB_MAXBYTES; + else if (has_mbyte) + len += 2; + else + #endif + ++len; + if (state->c != NFA_ANY) + { + /* skip over the characters */ + state = state->out1->out; + continue; + } + break; + + case NFA_DIGIT: + case NFA_WHITE: + case NFA_HEX: + case NFA_OCTAL: + /* ascii */ + ++len; + break; + + case NFA_IDENT: + case NFA_SIDENT: + case NFA_KWORD: + case NFA_SKWORD: + case NFA_FNAME: + case NFA_SFNAME: + case NFA_PRINT: + case NFA_SPRINT: + case NFA_NWHITE: + case NFA_NDIGIT: + case NFA_NHEX: + case NFA_NOCTAL: + case NFA_WORD: + case NFA_NWORD: + case NFA_HEAD: + case NFA_NHEAD: + case NFA_ALPHA: + case NFA_NALPHA: + case NFA_LOWER: + case NFA_NLOWER: + case NFA_UPPER: + case NFA_NUPPER: + /* possibly non-ascii */ + #ifdef FEAT_MBYTE + if (has_mbyte) + len += 3; + else + #endif + ++len; + break; + + case NFA_START_INVISIBLE: + case NFA_START_INVISIBLE_NEG: + case NFA_START_INVISIBLE_BEFORE: + case NFA_START_INVISIBLE_BEFORE_NEG: + /* zero-width, out1 points to the END state */ + state = state->out1->out; + continue; + + case NFA_BACKREF1: + case NFA_BACKREF2: + case NFA_BACKREF3: + case NFA_BACKREF4: + case NFA_BACKREF5: + case NFA_BACKREF6: + case NFA_BACKREF7: + case NFA_BACKREF8: + case NFA_BACKREF9: + #ifdef FEAT_SYN_HL + case NFA_ZREF1: + case NFA_ZREF2: + case NFA_ZREF3: + case NFA_ZREF4: + case NFA_ZREF5: + case NFA_ZREF6: + case NFA_ZREF7: + case NFA_ZREF8: + case NFA_ZREF9: + #endif + case NFA_NEWL: + case NFA_SKIP: + /* unknown width */ + return -1; + + case NFA_BOL: + case NFA_EOL: + case NFA_BOF: + case NFA_EOF: + case NFA_BOW: + case NFA_EOW: + case NFA_MOPEN: + case NFA_MOPEN1: + case NFA_MOPEN2: + case NFA_MOPEN3: + case NFA_MOPEN4: + case NFA_MOPEN5: + case NFA_MOPEN6: + case NFA_MOPEN7: + case NFA_MOPEN8: + case NFA_MOPEN9: + #ifdef FEAT_SYN_HL + case NFA_ZOPEN: + case NFA_ZOPEN1: + case NFA_ZOPEN2: + case NFA_ZOPEN3: + case NFA_ZOPEN4: + case NFA_ZOPEN5: + case NFA_ZOPEN6: + case NFA_ZOPEN7: + case NFA_ZOPEN8: + case NFA_ZOPEN9: + case NFA_ZCLOSE: + case NFA_ZCLOSE1: + case NFA_ZCLOSE2: + case NFA_ZCLOSE3: + case NFA_ZCLOSE4: + case NFA_ZCLOSE5: + case NFA_ZCLOSE6: + case NFA_ZCLOSE7: + case NFA_ZCLOSE8: + case NFA_ZCLOSE9: + #endif + case NFA_MCLOSE: + case NFA_MCLOSE1: + case NFA_MCLOSE2: + case NFA_MCLOSE3: + case NFA_MCLOSE4: + case NFA_MCLOSE5: + case NFA_MCLOSE6: + case NFA_MCLOSE7: + case NFA_MCLOSE8: + case NFA_MCLOSE9: + case NFA_NOPEN: + case NFA_NCLOSE: + + case NFA_LNUM_GT: + case NFA_LNUM_LT: + case NFA_COL_GT: + case NFA_COL_LT: + case NFA_VCOL_GT: + case NFA_VCOL_LT: + case NFA_MARK_GT: + case NFA_MARK_LT: + case NFA_VISUAL: + case NFA_LNUM: + case NFA_CURSOR: + case NFA_COL: + case NFA_VCOL: + case NFA_MARK: + + case NFA_ZSTART: + case NFA_ZEND: + case NFA_OPT_CHARS: + case NFA_SKIP_CHAR: + case NFA_START_PATTERN: + case NFA_END_PATTERN: + case NFA_COMPOSING: + case NFA_END_COMPOSING: + /* zero-width */ + break; + + default: + if (state->c < 0) + /* don't know what this is */ + return -1; + /* normal character */ + len += MB_CHAR2LEN(state->c); + break; + } + + /* normal way to continue */ + state = state->out; + } + + /* unrecognized */ + return -1; + } + /* * Convert a postfix form into its equivalent NFA. * Return the NFA start state on success, NULL otherwise. */ *************** *** 2856,2863 **** s = alloc_state(start_state, e.start, s1); if (s == NULL) goto theend; - if (before) - s->val = n; /* store the count */ if (pattern) { /* NFA_ZEND -> NFA_END_PATTERN -> NFA_SKIP -> what follows. */ --- 3073,3078 ---- *************** *** 2871,2876 **** --- 3086,3099 ---- { patch(e.out, s1); PUSH(frag(s, list1(&s1->out))); + if (before) + { + if (n <= 0) + /* See if we can guess the maximum width, it avoids a + * lot of pointless tries. */ + n = nfa_max_width(e.start, 0); + s->val = n; /* store the count */ + } } break; } *************** *** 4088,4096 **** /* Go back the specified number of bytes, or as far as the * start of the previous line, to try matching "\@<=" or ! * not matching "\@val <= 0) { if (REG_MULTI) --- 4311,4318 ---- /* Go back the specified number of bytes, or as far as the * start of the previous line, to try matching "\@<=" or ! * not matching "\@val <= 0) { if (REG_MULTI) *************** *** 4386,4392 **** /* * Check for a match with match_text. ! * Called after skip_to_start() has find regstart. * Returns zero for no match, 1 for a match. */ static long --- 4608,4614 ---- /* * Check for a match with match_text. ! * Called after skip_to_start() has found regstart. * Returns zero for no match, 1 for a match. */ static long *************** *** 4736,4742 **** #ifdef FEAT_SYN_HL || (cout >= NFA_ZCLOSE && cout <= NFA_ZCLOSE9) #endif - || cout == NFA_NCLOSE || t->pim != NULL || (t->state->c != NFA_START_INVISIBLE_BEFORE && t->state->c != NFA_START_INVISIBLE_BEFORE_NEG --- 4958,4963 ---- *** ../vim-7.3.1149/src/version.c 2013-06-08 18:19:40.000000000 +0200 --- src/version.c 2013-06-08 22:15:27.000000000 +0200 *************** *** 730,731 **** --- 730,733 ---- { /* Add new patch number below this line */ + /**/ + 1150, /**/ -- Amnesia is one of my favorite words, but I forgot what it means. /// Bram Moolenaar -- Bram@Moolenaar.net -- http://www.Moolenaar.net \\\ /// sponsor Vim, vote for features -- http://www.Vim.org/sponsor/ \\\ \\\ an exciting new programming language -- http://www.Zimbu.org /// \\\ help me help AIDS victims -- http://ICCF-Holland.org ///