javascript - Regex to not match partial sequences, but match full ones -
I have escaped HTML like this:
`&lt;img border=&#39;0&#39; /&gt;` I'm trying to match and replace full escape sequences like `&#39;` but not partial ones, like `39`, since `39` is not actually in the unescaped string. Essentially, each escape sequence should be treated like a single token.
This is a JS regex. Is there a way to exclude matches between `&` and `;` while still accepting sequences that include both of those characters?
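Desired results:
- Search `&lt;img border=&#39;0&#39; /&gt;` for `lt`: no match.
- Search `&lt;img border=&#39;0&#39; /&gt;` for `39`: no match.
- Search `&lt;img border=&#39;0&#39; /&gt;` for `&#39;`: match.
- Search `&lt;img border=&#39;0&#39; /&gt;` for `border=&#39;`: match.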
current code:
> var str = '&lt;img border=&#39;0&#39; /&gt;'
> str.replace(/(border)/gi, '|$1|')
'&lt;img |border|=&#39;0&#39; /&gt;' // ok
> str.replace(/(39)/gi, '|$1|')
'&lt;img border=&#|39|;0&#|39|; /&gt;' // not ok
Note: I can't unescape and then re-escape to match. It has to stay escaped.
The OP wants a JavaScript regex to match and replace a string within escaped HTML while treating escape sequences (e.g. `&lt;`, `&#39;`, or `&#x7f;`) as single characters, and without unescaping the HTML string during the replacement process.
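This means that replacing
1. `"lt"` with `"[lt]"` in `"&lt; lt"` should result in `"&lt; [lt]"` (avoid match within entity)
2. `"&lt;"` with `"[&lt;]"` in `"&lt; lt"` should result in `"[&lt;] lt"` (match entity)
3. `"&l"` with `"[&l]"` in `"&lt; &lt"` should result in `"&lt; [&l]t"` (not match partial entity)
4. `"t;"` with `"[t;]"` in `"&lt; lt;"` should result in `"&lt; l[t;]"` (not match partial entity)
5. `"&lt; l"` with `"[&lt; l]"` in `"&lt; lt"` should result in `"[&lt; l]t"` (match including entity)
6. `"lt; &l"` with `"[lt; &l]"` in `"&lt; &lt;"` should result in `"&lt; &lt;"` (not match partial entity)
7. `"t; &lt;"` with `"[t; &lt;]"` in `"lt; &lt;"` should result in `"l[t; &lt;]"` (match including entity)
8. `"t; &lt"` with `"[t; &lt]"` in `"lt; &lt;"` should result in `"lt; &lt;"` (not match partial entity)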
With the following regex for capturing escaped sequences (e.g. `&lt;`, `&#39;`, or `&#x7f;`),
`/&[a-z]+;|&#x[a-f\d]+;|&#\d+;/gi`
we may use the following function as a starting point that handles most of the cases above (#1, #2, #4, #5, and #7):
/**
 * Replaces every occurrence of `searchfor` in the escaped-HTML string `str`
 * with `replacement`, without matching inside a complete HTML entity.
 *
 * Starting-point version: a search string that itself ends in a partial
 * entity (e.g. "&l") can still match the beginning of a real entity.
 *
 * @param {string} searchfor - Literal text to look for (may contain entities).
 * @param {string} replacement - Text inserted in place of each match.
 * @param {string} str - Escaped HTML to search; it is never unescaped.
 * @returns {string} A copy of `str` with the matches replaced.
 */
function searchandreplace(searchfor, replacement, str) {
  // At each position the search text is tried first; otherwise a complete
  // entity is consumed whole (captured in group 1) so that the search text
  // cannot match somewhere inside it.
  var pattern = new RegExp(
    prepare(searchfor) + "|(&[a-z]+;|&#x[a-f\\d]+;|&#\\d+;)", // consume entities
    "gi"
  );

  return str.replace(pattern, function (m, entity) {
    // Group 1 is only set when an entity was consumed: put it back untouched.
    return entity || replacement;
  });
}

/**
 * Turns literal text into a regex source string.
 *
 * @param {string} str - Literal search text.
 * @returns {string} `str` with every non-word, non-whitespace character
 *   backslash-escaped.
 */
function prepare(str) {
  return str.replace(/[^\w\s]/g, "\\$&"); // escape regex metachars [1]
}

// [1] http://eloquentjavascript.net/09_regexp.html#h_rhu25fogrg
// The remaining cases (#3, #6, #8) involve a potential partial escaped sequence at the end of the search string.
A solution is to check the `searchfor` string for potential partial escaped sequences at the end and append a corresponding negated lookahead `(?!)` to prevent matching a valid escaped sequence. A full solution (passing a set of 40 test cases) is shown below, and should be faster and less complex than an `.exec()` approach:
/**
 * Replaces every occurrence of `searchfor` in the escaped-HTML string `str`
 * with `replacement`, treating each complete HTML entity (`&name;`,
 * `&#123;`, `&#x1f;`) as an indivisible token.
 *
 * @param {string} searchfor - Literal text to look for (may contain entities).
 * @param {string} replacement - Text inserted in place of each match.
 * @param {string} str - Escaped HTML to search; it is never unescaped.
 * @returns {string} A copy of `str` with the matches replaced.
 */
function searchandreplace(searchfor, replacement, str) {
  // At each position the search text is tried first; otherwise a complete
  // entity is consumed whole (captured in group 1) so that the search text
  // cannot match somewhere inside it.
  var pattern = new RegExp(
    prepare(searchfor) + "|(&[a-z]+;|&#x[a-f0-9]+;|&#\\d+;)",
    "gi"
  );

  return str.replace(pattern, function (m, entity) {
    // Group 1 is only set when an entity was consumed: put it back untouched.
    return entity || replacement;
  });
}

/**
 * Turns literal search text into a regex source string.
 *
 * Every non-word, non-whitespace character is backslash-escaped. If the text
 * ends in something that could be the beginning of an entity, a negative
 * lookahead is appended so the match is rejected whenever the text that
 * follows would complete that entity.
 *
 * @param {string} str - Literal search text.
 * @returns {string} Regex source for `str`, possibly followed by a lookahead.
 */
function prepare(str) {
  var add = "";

  if (/&$/.test(str)) {
    add = "(?!#x[a-z\\d]+;|#\\d+;|[a-z]+;)";
  } else if (/&[a-z]+$/i.test(str)) {
    add = "(?![a-z]*;)";
  } else if (/&#$/.test(str)) {
    add = "(?!x[a-f\\d]+;|\\d+;)";
  } else if (/&#x$/i.test(str)) {
    // Case-insensitive, like the regex this source ends up in.
    add = "(?![a-f\\d]+;)";
  } else if (/&#x[a-f\d]+$/i.test(str)) {
    add = "(?![a-f\\d]*;)";
  } else if (/&#\d+$/.test(str)) {
    // Partial decimal reference such as "&#3": must not match inside "&#39;".
    add = "(?!\\d*;)";
  }

  return str.replace(/[^\w\s]/g, "\\$&") + add;
}

/**
 * Test helper: runs searchandreplace and logs whether the result equals
 * `expected`.
 *
 * @param {string} searchfor - Search text passed to searchandreplace.
 * @param {string} replacement - Replacement passed to searchandreplace.
 * @param {string} str - Input passed to searchandreplace.
 * @param {string} expected - Expected return value.
 */
function test(searchfor, replacement, str, expected) {
  var result = searchandreplace(searchfor, replacement, str);
  console.log(
    searchfor +
      ": " +
      (result === expected ? "passed" : "failed: " + [expected, result])
  );
}

// Test cases. Inputs are escaped HTML; several deliberately contain
// unterminated sequences (e.g. "&lt" with no ";"), which are not entities.
test("lt", "[lt]", "&lt;img border=&#39;0&#39; /&gt;", "&lt;img border=&#39;0&#39; /&gt;");
test("39", "[39]", "&lt;img border=&#39;0&#39; /&gt;", "&lt;img border=&#39;0&#39; /&gt;");
test("&#39;", "[&#39;]", "&lt;img border=&#39;0&#39; /&gt;", "&lt;img border=[&#39;]0[&#39;] /&gt;");
test("border=&#39;", "[border=&#39;]", "&lt;img border=&#39;0&#39; /&gt;", "&lt;img [border=&#39;]0&#39; /&gt;");
test("39&", "[39&]", "39&lt;img border=39&#39;&gt;&gt;&&#39; t; 0&#39;&39; /&gt;", "39&lt;img border=39&#39;&gt;&gt;&&#39; t; 0&#39;&39; /&gt;");
test("0&#", "[0&#]", "39&lt;img border=39&#39;&gt;&gt;&&#39; t; 0&#39;&39; /&gt;", "39&lt;img border=39&#39;&gt;&gt;&&#39; t; 0&#39;&39; /&gt;");

test("lt", "[]", "&lt&lt;t;t&l", "&[]&lt;t;t&l");
test("&lt;", "[]", "&lt&lt;t;t&l", "&lt[]t;t&l");
test("&l", "[]", "&lt&lt;t;t&l", "[]t&lt;t;t[]");
test("t;", "[]", "&lt&lt;t;t&l", "&lt&lt;[]t&l");
test("t&", "[]", "&lt&lt;t;t&l", "&lt&lt;t;[]l");
test("&lt;t", "[]", "&lt&lt;t;t&l", "&lt[];t&l");
test("t&lt;", "[]", "&lt&lt;t;t&l", "&l[]t;t&l");
test("t;t", "[]", "&lt&lt;t;t&l", "&lt&lt;[]&l");
test("t&l", "[]", "&lt&lt;t;t&l", "&lt&lt;t;[]");

test("39", "[]", "&#039&#039;9;9&#", "&#0[]&#039;9;9&#");
test("&#039;", "[]", "&#039&#039;9;9&#", "&#039[]9;9&#");
test("&", "[]", "&#039&#039;9;9&#", "[]#039&#039;9;9[]#");
test("&#", "[]", "&#039&#039;9;9&#", "[]039&#039;9;9[]");
test("9;", "[]", "&#039&#039;9;9&#", "&#039&#039;[]9&#");
test("9&", "[]", "&#039&#039;9;9&#", "&#039&#039;9;[]#");
test("&#039;9", "[]", "&#039&#039;9;9&#", "&#039[];9&#");
test("9&#039;", "[]", "&#039&#039;9;9&#", "&#03[]9;9&#");
test("9;9", "[]", "&#039&#039;9;9&#", "&#039&#039;[]&#");
test("9&#", "[]", "&#039&#039;9;9&#", "&#039&#039;9;[]");

test("x7", "[]", "&#x7f&#x7f;f;f&#x", "&#[]f&#x7f;f;f&#x");
test("&#x7f;", "[]", "&#x7f&#x7f;f;f&#x", "&#x7f[]f;f&#x");
test("&", "[]", "&#x7f&#x7f;f;f&#x", "[]#x7f&#x7f;f;f[]#x");
test("&#", "[]", "&#x7f&#x7f;f;f&#x", "[]x7f&#x7f;f;f[]x");
test("&#x", "[]", "&#x7f&#x7f;f;f&#x", "[]7f&#x7f;f;f[]");
test("&#x7", "[]", "&#x7f&#x7f;f;f&#x", "[]f&#x7f;f;f&#x");
test("f;", "[]", "&#x7f&#x7f;f;f&#x", "&#x7f&#x7f;[]f&#x");
test("f&", "[]", "&#x7f&#x7f;f;f&#x", "&#x7f&#x7f;f;[]#x");
test("&#x7f;f", "[]", "&#x7f&#x7f;f;f&#x", "&#x7f[];f&#x");
test("f&#x7f;", "[]", "&#x7f&#x7f;f;f&#x", "&#x7[]f;f&#x");
test("f;f", "[]", "&#x7f&#x7f;f;f&#x", "&#x7f&#x7f;[]&#x");
test("f&#", "[]", "&#x7f&#x7f;f;f&#x", "&#x7f&#x7f;f;[]x");
test("f&#x", "[]", "&#x7f&#x7f;f;f&#x", "&#x7f&#x7f;f;[]");

test("t; &lt; lt &l", "[]", "&lt; &lt; lt &lt;lt; &lt; lt &lt", "&lt; &lt; lt &lt;l[]t");

// Search text ending in a partial decimal reference.
test("&#3", "[]", "&#39;&#3", "&#39;[]");
Comments
Post a Comment