javascript - Regex to not match partial sequences, but match full ones -
I have escaped HTML like this:
&lt;img border=&#39;0&#39; /&gt;
I'm trying to match and replace full escape sequences such as &#39;
but not partial ones, such as 39,
since 39
is not actually in the unescaped string. Essentially, each escape sequence should be treated as a single token.
This is a JS regex. Is there a way to exclude matches between &
and ;
while still accepting sequences that include both of those characters?
Desired results:
- Search
&lt;img border=&#39;0&#39; /&gt;
for lt
: no match.
- Search
&lt;img border=&#39;0&#39; /&gt;
for 39
: no match.
- Search
&lt;img border=&#39;0&#39; /&gt;
for &#39;
: match.
- Search &lt;img border=&#39;0&#39; /&gt; for border=&#39; : match.
Current code:
> var str = '&lt;img border=&#39;0&#39; /&gt;'
> str.replace(/(border)/gi, '|$1|')
'&lt;img |border|=&#39;0&#39; /&gt;' // ok
> str.replace(/(39)/gi, '|$1|')
'&lt;img border=&#|39|;0&#|39|; /&gt;' // not ok
Note: I can't unescape the string and then re-escape it to do the match. It has to stay escaped.
The OP wants a JavaScript regex to match and replace a string within escaped HTML while treating escape sequences (e.g. &lt;
, &#39;
, or a hexadecimal reference such as &#x7f;
) as single characters, and without unescaping the HTML string during the replacement process.
This means that replacing
1. "lt" with "[lt]" in "&lt; lt" would result in "&lt; [lt]" (avoid a match within an entity)
2. "&lt;" with "[&lt;]" in "&lt; lt" would result in "[&lt;] lt" (match the entity)
3. "&l" with "[&l]" in "&lt; &lt" would result in "&lt; [&l]t" (do not match a partial entity)
4. "t;" with "[t;]" in "&lt; lt;" would result in "&lt; l[t;]" (do not match a partial entity)
5. "&lt; l" with "[&lt; l]" in "&lt; lt" would result in "[&lt; l]t" (match including the entity)
6. "lt; &l" with "[lt; &l]" in "&lt; &lt;" would result in "&lt; &lt;" (do not match a partial entity)
7. "t; &lt;" with "[t; &lt;]" in "lt; &lt;" would result in "l[t; &lt;]" (match including the entity)
8. "t; &lt" with "[t; &lt]" in "lt; &lt;" would result in "lt; &lt;" (do not match a partial entity)
With the following regex for capturing escaped sequences (e.g. &lt;
, &#39;
, or a hexadecimal reference such as &#x7f;
),
/&[a-z]+;|&#x[a-f\d]+;|&#\d+;/gi
we may use the following function as a starting point; it handles most of the cases above (#1, #2, #4, #5, and #7):
/**
 * Replaces occurrences of `searchfor` in an HTML-escaped string while
 * leaving complete character references (e.g. `&lt;`, `&#39;`, `&#x7f;`)
 * that are not part of a match untouched.
 *
 * The regex is an alternation: the escaped search text first, then a
 * capturing group that matches any complete entity. When the entity branch
 * matches, the entity is returned unchanged, which "consumes" it so the
 * search text can never match starting inside it.
 *
 * Limitation of this starting-point version: a search string that ENDS in
 * a partial entity (e.g. "&l") can still match the beginning of a real
 * entity, because the search branch is tried before the entity branch.
 *
 * @param {string} searchfor - Literal text to look for (not a regex).
 * @param {string} replacement - Literal text to insert for each match.
 * @param {string} str - The HTML-escaped string to search in.
 * @returns {string} `str` with every match replaced.
 */
function searchandreplace(searchfor, replacement, str) {
  return str.replace(
    // `RegExp` is the built-in constructor; the name is case-sensitive.
    new RegExp(
      prepare(searchfor) +
        "|(&[a-z]+;|&#x[a-f\\d]+;|&#\\d+;)", // consume entities
      "gi"
    ),
    function (m, entity) {
      // `entity` is undefined when the search branch matched.
      return entity || replacement;
    }
  );
}

/**
 * Escapes every character that is neither a word character nor whitespace
 * so the text can be embedded literally in a regular expression.
 *
 * @param {string} str - Literal search text.
 * @returns {string} Regex source matching `str` literally.
 */
function prepare(str) {
  return str.replace(/[^\w\s]/g, "\\$&"); // escape regex metachars [1]
}

// [1] http://eloquentjavascript.net/09_regexp.html#h_rhu25fogrg
The remaining cases (#3, #6, #8) involve a potential partial escaped sequence at the end of the search string.
A solution is to check the searchfor
string for potential partial escaped sequences at its end, and to append a corresponding negated lookahead (?!)
to prevent matching a valid escaped sequence. A full solution (passing a set of 40 test cases) is shown below, and should be faster and less complex than an .exec()
approach:
/**
 * Replaces occurrences of `searchfor` in an HTML-escaped string while
 * treating every complete character reference (`&name;`, `&#123;`,
 * `&#x7f;`) as a single indivisible token.
 *
 * The regex is an alternation of the prepared search text and a capturing
 * group matching any complete entity. When the entity branch matches, the
 * entity is returned unchanged, so the search text can never start matching
 * in the middle of an entity. `prepare` additionally guards the END of the
 * search text against swallowing the beginning of an entity.
 *
 * @param {string} searchfor - Literal text to look for (not a regex).
 * @param {string} replacement - Literal text to insert for each match.
 * @param {string} str - The HTML-escaped string to search in.
 * @returns {string} `str` with every match replaced.
 */
function searchandreplace(searchfor, replacement, str) {
  return str.replace(
    // `RegExp` is the built-in constructor; the name is case-sensitive.
    new RegExp(
      prepare(searchfor) + "|(&[a-z]+;|&#x[a-f0-9]+;|&#\\d+;)",
      "gi"
    ),
    function (m, entity) {
      // `entity` is undefined when the search branch matched.
      return entity || replacement;
    }
  );
}

/**
 * Turns literal search text into regex source. If the text ends in what
 * could be the beginning of a character reference, a negative lookahead is
 * appended so the match is rejected when the remainder of a valid reference
 * follows (i.e. when the match would split an entity).
 *
 * @param {string} str - Literal search text.
 * @returns {string} Regex source for `str`, possibly followed by `(?!...)`.
 */
function prepare(str) {
  var add = "";
  if (/&$/.test(str)) {
    // Bare "&": reject if any kind of reference body follows.
    add = "(?!#x[a-f\\d]+;|#\\d+;|[a-z]+;)";
  } else if (/&[a-z]+$/i.test(str)) {
    // Partial named reference, e.g. "&l".
    add = "(?![a-z]*;)";
  } else if (/&#$/.test(str)) {
    // "&#": reject if a decimal or hexadecimal body follows.
    add = "(?!x[a-f\\d]+;|\\d+;)";
  } else if (/&#\d+$/.test(str)) {
    // Partial decimal reference, e.g. "&#3" must not match inside "&#39;".
    add = "(?!\\d*;)";
  } else if (/&#x$/i.test(str)) {
    // "&#x" (either case, since the final regex is case-insensitive).
    add = "(?![a-f\\d]+;)";
  } else if (/&#x[a-f\d]+$/i.test(str)) {
    // Partial hexadecimal reference, e.g. "&#x7".
    add = "(?![a-f\\d]*;)";
  }
  // Escape everything that is neither a word character nor whitespace.
  return str.replace(/[^\w\s]/g, "\\$&") + add;
}

// test function
function test(searchfor, replacement, str, expected) {
  var result = searchandreplace(searchfor, replacement, str);
  console.log(
    searchfor +
      ": " +
      (result === expected ? "passed" : "failed: " + [expected, result])
  );
}

// test cases
test("lt", "[lt]", "&lt;img border=&#39;0&#39; /&gt;", "&lt;img border=&#39;0&#39; /&gt;");
test("39", "[39]", "&lt;img border=&#39;0&#39; /&gt;", "&lt;img border=&#39;0&#39; /&gt;");
test("&#39;", "[&#39;]", "&lt;img border=&#39;0&#39; /&gt;", "&lt;img border=[&#39;]0[&#39;] /&gt;");
test("border=&#39;", "[border=&#39;]", "&lt;img border=&#39;0&#39; /&gt;", "&lt;img [border=&#39;]0&#39; /&gt;");
test("39&", "[39&]", "39&lt;img border=39&#39;&gt;&gt;&&#39; t; 0&#39;&39; /&gt;", "39&lt;img border=39&#39;&gt;&gt;&&#39; t; 0&#39;&39; /&gt;");
test("0&#", "[0&#]", "39&lt;img border=39&#39;&gt;&gt;&&#39; t; 0&#39;&39; /&gt;", "39&lt;img border=39&#39;&gt;&gt;&&#39; t; 0&#39;&39; /&gt;");

test("lt", "[]", "&lt&lt;t;t&l", "&[]&lt;t;t&l");
test("&lt;", "[]", "&lt&lt;t;t&l", "&lt[]t;t&l");
test("&l", "[]", "&lt&lt;t;t&l", "[]t&lt;t;t[]");
test("t;", "[]", "&lt&lt;t;t&l", "&lt&lt;[]t&l");
test("t&", "[]", "&lt&lt;t;t&l", "&lt&lt;t;[]l");
test("&lt;t", "[]", "&lt&lt;t;t&l", "&lt[];t&l");
test("t&lt;", "[]", "&lt&lt;t;t&l", "&l[]t;t&l");
test("t;t", "[]", "&lt&lt;t;t&l", "&lt&lt;[]&l");
test("t&l", "[]", "&lt&lt;t;t&l", "&lt&lt;t;[]");

test("39", "[]", "&#039&#039;9;9&#", "&#0[]&#039;9;9&#");
test("&#039;", "[]", "&#039&#039;9;9&#", "&#039[]9;9&#");
test("&", "[]", "&#039&#039;9;9&#", "[]#039&#039;9;9[]#");
test("&#", "[]", "&#039&#039;9;9&#", "[]039&#039;9;9[]");
test("9;", "[]", "&#039&#039;9;9&#", "&#039&#039;[]9&#");
test("9&", "[]", "&#039&#039;9;9&#", "&#039&#039;9;[]#");
test("&#039;9", "[]", "&#039&#039;9;9&#", "&#039[];9&#");
test("9&#039;", "[]", "&#039&#039;9;9&#", "&#03[]9;9&#");
test("9;9", "[]", "&#039&#039;9;9&#", "&#039&#039;[]&#");
test("9&#", "[]", "&#039&#039;9;9&#", "&#039&#039;9;[]");

test("x7", "[]", "&#x7f&#x7f;f;f&#x", "&#[]f&#x7f;f;f&#x");
test("&#x7f;", "[]", "&#x7f&#x7f;f;f&#x", "&#x7f[]f;f&#x");
test("&", "[]", "&#x7f&#x7f;f;f&#x", "[]#x7f&#x7f;f;f[]#x");
test("&#", "[]", "&#x7f&#x7f;f;f&#x", "[]x7f&#x7f;f;f[]x");
test("&#x", "[]", "&#x7f&#x7f;f;f&#x", "[]7f&#x7f;f;f[]");
test("&#x7", "[]", "&#x7f&#x7f;f;f&#x", "[]f&#x7f;f;f&#x");
test("f;", "[]", "&#x7f&#x7f;f;f&#x", "&#x7f&#x7f;[]f&#x");
test("f&", "[]", "&#x7f&#x7f;f;f&#x", "&#x7f&#x7f;f;[]#x");
test("&#x7f;f", "[]", "&#x7f&#x7f;f;f&#x", "&#x7f[];f&#x");
test("f&#x7f;", "[]", "&#x7f&#x7f;f;f&#x", "&#x7[]f;f&#x");
test("f;f", "[]", "&#x7f&#x7f;f;f&#x", "&#x7f&#x7f;[]&#x");
test("f&#", "[]", "&#x7f&#x7f;f;f&#x", "&#x7f&#x7f;f;[]x");
test("f&#x", "[]", "&#x7f&#x7f;f;f&#x", "&#x7f&#x7f;f;[]");

test("t; &lt; lt &l", "[]", "&lt; &lt; lt &lt;lt; &lt; lt &lt", "&lt; &lt; lt &lt;l[]t");
Comments
Post a Comment