640 likes | 805 Views
Fast and Precise Sanitizer Analysis with Bek. Pieter Hooimeijer Ben Livshits David Molnar Prateek Saxena Margus Veanes. 2011-08-10 USENIX Security. <img src='some untrusted input'/>. <img src='some untrusted input'/>. Question: What could possibly go wrong?.
E N D
Fast and Precise Sanitizer Analysis with Bek Pieter Hooimeijer Ben Livshits David Molnar Prateek Saxena Margus Veanes 2011-08-10 USENIX Security
<img src='some untrusted input'/> Question: What could possibly go wrong?
<img src='some untrusted input'/> Attacker: im.png' onload='javascript:...
<img src='some untrusted input'/> Attacker: im.png' onload='javascript:...
<img src='some untrusted input'/> Attacker: im.png' onload='javascript:... Result: <img src='im.png' onload='javascri
<img src='some untrusted input'/> Attacker: im.png' onload='javascript:... Result: <img src='im.png' onload='javascri FAIL
' ' single quote html entity
some untrusted input Library A Name: Around for: Availability: HtmlEncode Years Readily available to C# developers
some untrusted input Library A Name: Around for: Availability: Library B Name: Around for: Availability: HtmlEncode Years Readily available to C# developers HtmlEncode Years Readily available to C# developers
Library A Name: Around for: Availability: Library B Name: Around for: Availability: HtmlEncode Years Readily available to C# developers HtmlEncode Years Readily available to C# developers ✔ ✘ ' ' ' '
MS AntiXSS .NET WebUtility private static string HtmlEncode(string input, bool useNamedEntities, MethodSpecificEncoder encoderTweak) { if (string.IsNullOrEmpty(input)) { return input; } if (characterValues == null) { InitialiseSafeList(); } if (useNamedEntities && namedEntities == null) { InitialiseNamedEntityList(); } // Setup a new character array for output. char[] inputAsArray = input.ToCharArray(); int outputLength = 0; int inputLength = inputAsArray.Length; char[] encodedInput = new char[inputLength * 10]; SyncLock.EnterReadLock(); try { for (int i = 0; i < inputLength; i++) { char currentCharacter = inputAsArray[i]; int currentCodePoint = inputAsArray[i]; char[] tweekedValue; // Check for invalid values if (currentCodePoint == 0xFFFE || currentCodePoint == 0xFFFF) { throw new InvalidUnicodeValueException(currentCodePoint); } else if (char.IsHighSurrogate(currentCharacter)) { if (i + 1 == inputLength) { throw new InvalidSurrogatePairException(currentCharacter, '\0'); } // Now peek ahead and check if the following character is a low surrogate. char nextCharacter = inputAsArray[i + 1]; char nextCodePoint = inputAsArray[i + 1]; if (!char.IsLowSurrogate(nextCharacter)) { throw new InvalidSurrogatePairException(currentCharacter, nextCharacter); } // Look-ahead was good, so skip. 
i++; // Calculate the combined code point long combinedCodePoint = 0x10000 + ((currentCodePoint - 0xD800) * 0x400) + (nextCodePoint - 0xDC00); char[] encodedCharacter = SafeList.HashThenValueGenerator(combinedCodePoint); encodedInput[outputLength++] = '&'; for (int j = 0; j < encodedCharacter.Length; j++) { encodedInput[outputLength++] = encodedCharacter[j]; } encodedInput[outputLength++] = ';'; } else if (char.IsLowSurrogate(currentCharacter)) { throw new InvalidSurrogatePairException('\0', currentCharacter); } else if (encoderTweak != null && encoderTweak(currentCharacter, out tweekedValue)) { for (int j = 0; j < tweekedValue.Length; j++) { encodedInput[outputLength++] = tweekedValue[j]; } } else if (useNamedEntities && namedEntities[currentCodePoint] != null) { char[] encodedCharacter = namedEntities[currentCodePoint]; encodedInput[outputLength++] = '&'; for (int j = 0; j < encodedCharacter.Length; j++) { encodedInput[outputLength++] = encodedCharacter[j]; } encodedInput[outputLength++] = ';'; } else if (characterValues[currentCodePoint] != null) { // character needs to be encoded char[] encodedCharacter = characterValues[currentCodePoint]; encodedInput[outputLength++] = '&'; for (int j = 0; j < encodedCharacter.Length; j++) { encodedInput[outputLength++] = encodedCharacter[j]; } encodedInput[outputLength++] = ';'; } else { // character does not need encoding encodedInput[outputLength++] = currentCharacter; } } } finally { SyncLock.ExitReadLock(); } return new string(encodedInput, 0, outputLength); } public static string HtmlEncode(string s) { if (s == null) return null; int num = IndexOfHtmlEncodingChars(s, 0); if (num == -1) return s; StringBuilder builder = new StringBuilder(s.Length+5); int length = s.Length; int startIndex = 0; Label_002A: if (num > startIndex) { builder.Append(s, startIndex, num-startIndex); } char ch = s[num]; if (ch > '>') { builder.Append("&#"); builder.Append(((int) ch).ToString(NumberFormatInfo.InvariantInfo)); builder.Append(';'); } else 
{ char ch2 = ch; if (ch2 != '"') { switch (ch2) { case '<': builder.Append("&lt;"); goto Label_00D5; case '=': goto Label_00D5; case '>': builder.Append("&gt;"); goto Label_00D5; case '&': builder.Append("&amp;"); goto Label_00D5; } } else { builder.Append("&quot;"); } } Label_00D5: startIndex = num + 1; if (startIndex < length) { num = IndexOfHtmlEncodingChars(s, startIndex); if (num != -1) { goto Label_002A; } builder.Append(s, startIndex, length-startIndex); } return builder.ToString(); }
MS AntiXSS .NET WebUtility private static string HtmlEncode(string input, bool useNamedEntities, MethodSpecificEncoder encoderTweak) { if (string.IsNullOrEmpty(input)) { return input; } if (characterValues == null) { InitialiseSafeList(); } if (useNamedEntities && namedEntities == null) { InitialiseNamedEntityList(); } // Setup a new character array for output. char[] inputAsArray = input.ToCharArray(); int outputLength = 0; int inputLength = inputAsArray.Length; char[] encodedInput = new char[inputLength * 10]; SyncLock.EnterReadLock(); try { for (int i = 0; i < inputLength; i++) { char currentCharacter = inputAsArray[i]; int currentCodePoint = inputAsArray[i]; char[] tweekedValue; // Check for invalid values if (currentCodePoint == 0xFFFE || currentCodePoint == 0xFFFF) { throw new InvalidUnicodeValueException(currentCodePoint); } else if (char.IsHighSurrogate(currentCharacter)) { if (i + 1 == inputLength) { throw new InvalidSurrogatePairException(currentCharacter, '\0'); } // Now peek ahead and check if the following character is a low surrogate. char nextCharacter = inputAsArray[i + 1]; char nextCodePoint = inputAsArray[i + 1]; if (!char.IsLowSurrogate(nextCharacter)) { throw new InvalidSurrogatePairException(currentCharacter, nextCharacter); } // Look-ahead was good, so skip. 
i++; // Calculate the combined code point long combinedCodePoint = 0x10000 + ((currentCodePoint - 0xD800) * 0x400) + (nextCodePoint - 0xDC00); char[] encodedCharacter = SafeList.HashThenValueGenerator(combinedCodePoint); encodedInput[outputLength++] = '&'; for (int j = 0; j < encodedCharacter.Length; j++) { encodedInput[outputLength++] = encodedCharacter[j]; } encodedInput[outputLength++] = ';'; } else if (char.IsLowSurrogate(currentCharacter)) { throw new InvalidSurrogatePairException('\0', currentCharacter); } else if (encoderTweak != null && encoderTweak(currentCharacter, out tweekedValue)) { for (int j = 0; j < tweekedValue.Length; j++) { encodedInput[outputLength++] = tweekedValue[j]; } } else if (useNamedEntities && namedEntities[currentCodePoint] != null) { char[] encodedCharacter = namedEntities[currentCodePoint]; encodedInput[outputLength++] = '&'; for (int j = 0; j < encodedCharacter.Length; j++) { encodedInput[outputLength++] = encodedCharacter[j]; } encodedInput[outputLength++] = ';'; } else if (characterValues[currentCodePoint] != null) { // character needs to be encoded char[] encodedCharacter = characterValues[currentCodePoint]; encodedInput[outputLength++] = '&'; for (int j = 0; j < encodedCharacter.Length; j++) { encodedInput[outputLength++] = encodedCharacter[j]; } encodedInput[outputLength++] = ';'; } else { // character does not need encoding encodedInput[outputLength++] = currentCharacter; } } } finally { SyncLock.ExitReadLock(); } return new string(encodedInput, 0, outputLength); } public static string HtmlEncode(string s) { if (s == null) return null; int num = IndexOfHtmlEncodingChars(s, 0); if (num == -1) return s; StringBuilder builder = new StringBuilder(s.Length+5); int length = s.Length; int startIndex = 0; Label_002A: if (num > startIndex) { builder.Append(s, startIndex, num-startIndex); } char ch = s[num]; if (ch > '>') { builder.Append("&#"); builder.Append(((int) ch).ToString(NumberFormatInfo.InvariantInfo)); builder.Append(';'); } else 
{ char ch2 = ch; if (ch2 != '"') { switch (ch2) { case '<': builder.Append("&lt;"); goto Label_00D5; case '=': goto Label_00D5; case '>': builder.Append("&gt;"); goto Label_00D5; case '&': builder.Append("&amp;"); goto Label_00D5; } } else { builder.Append("&quot;"); } } Label_00D5: startIndex = num + 1; if (startIndex < length) { num = IndexOfHtmlEncodingChars(s, startIndex); if (num != -1) { goto Label_002A; } builder.Append(s, startIndex, length-startIndex); } return builder.ToString(); } • Same behavior on all inputs? • If not, what is a differentiating input? • Can it generate any known ‘bad’ outputs?
PHP Trunk Changes to html.c, 1999—2011 R7,841 April 1999 135 loc R309,482 March 2011 1693 loc
R32,564 September 2000 ENT_QUOTES introduced PHP Trunk Changes to html.c, 1999—2011 R7,841 April 1999 135 loc R309,482 March 2011 1693 loc
R32,564 September 2000 ENT_QUOTES introduced R242,949 September 2007 $double_encode=true PHP Trunk Changes to html.c, 1999—2011 R7,841 April 1999 135 loc R309,482 March 2011 1693 loc
PHP Trunk Changes to html.c, 1999—2011 • Safe to apply twice? • Safe to combine with other sanitizers?
Motivation • Writing string sanitizers correctly is difficult • There is no cheap way to identify problems with sanitizers • ‘Correctness’ is a moving target • What if we could say more about sanitizer behavior?
Contributions • Bek • Frontend: a small language for string manipulation; similar to how sanitizers are written today • Backend: a model based on symbolic finite transducers with algorithms for analysis and code generation
Contributions • Bek • Frontend: a small language for string manipulation; similar to how sanitizers are written today • Backend: a model based on symbolic finite transducers with algorithms for analysis and code generation • Evaluation • Converted sanitizers from a variety of sources • Checked properties like reversibility, idempotence, equivalence, and commutativity
Contributions • Bek • Frontend: a small language for string manipulation; similar to how sanitizers are written today • Backend: a model based on symbolic finite transducers with algorithms for analysis and code generation • Evaluation • Converted sanitizers from a variety of sources • Checked properties like reversibility, idempotence, equivalence, and commutativity
Bek: Architecture s := iter(c in t) [b := false;] { case (!b && c in "[\"\\]"): b := false; yield('\\', c); case (c == '\\'): b := !b; yield(c); case (true): b := false; yield(c); }; Bek Program
Bek: Architecture Transformation Symbolic Finite Transducers s := iter(c in t) [b := false;] { case (!b && c in "[\"\\]"): b := false; yield('\\', c); case (c == '\\'): b := !b; yield(c); case (true): b := false; yield(c); }; Microsoft.Automata Z3 Bek Program
Bek: Architecture Transformation Symbolic Finite Transducers Does it do the right thing? Counterexample “\' vs. \\'” Analysis s := iter(c in t) [b := false;] { case (!b && c in "[\"\\]"): b := false; yield('\\', c); case (c == '\\'): b := !b; yield(c); case (true): b := false; yield(c); }; Microsoft.Automata Z3 Bek Program
Bek: Architecture Transformation Symbolic Finite Transducers Does it do the right thing? Counterexample “\' vs. \\'” Analysis s := iter(c in t) [b := false;] { case (!b && c in "[\"\\]"): b := false; yield('\\', c); case (c == '\\'): b := !b; yield(c); case (true): b := false; yield(c); }; Microsoft.Automata Z3 Bek Program Code Gen Code Gen C# JavaScript C
Bek: Architecture Transformation Symbolic Finite Transducers Does it do the right thing? Counterexample “\' vs. \\'” Analysis s := iter(c in t) [b := false;] { case (!b && c in "[\"\\]"): b := false; yield('\\', c); case (c == '\\'): b := !b; yield(c); case (true): b := false; yield(c); }; Microsoft.Automata Z3 Bek Program Code Gen Code Gen C# JavaScript C
A Bek Program: Escape Quotes t := iter(c in s) [b := false;] { case (!b && c in "['\"]"): b := false; yield('\\', c); case (c == '\\'): b := !b; yield(c); case (true): b := false; yield(c); };
iterate over the characters in string s A Bek Program: Escape Quotes t := iter(c in s) [b := false;] { case (!b && c in "['\"]"): b := false; yield('\\', c); case (c == '\\'): b := !b; yield(c); case (true): b := false; yield(c); };
while updating one boolean variable b iterate over the characters in string s A Bek Program: Escape Quotes t := iter(c in s) [b := false;] { case (!b && c in "['\"]"): b := false; yield('\\', c); case (c == '\\'): b := !b; yield(c); case (true): b := false; yield(c); };
Bek: Architecture Transformation Symbolic Finite Transducers Does it do the right thing? Counterexample “\' vs. \\'” Analysis s := iter(c in t) [b := false;] { case (!b && c in "[\"\\]"): b := false; yield('\\', c); case (c == '\\'): b := !b; yield(c); case (true): b := false; yield(c); }; Microsoft.Automata Z3 Bek Program Code Gen Code Gen C# JavaScript C
A Symbolic Finite Transducer symbolic predicates
A Symbolic Finite Transducer symbolic predicates output lists
Bek: Architecture Transformation Symbolic Finite Transducers Does it do the right thing? Counterexample “\' vs. \\'” Analysis s := iter(c in t) [b := false;] { case (!b && c in "[\"\\]"): b := false; yield('\\', c); case (c == '\\'): b := !b; yield(c); case (true): b := false; yield(c); }; Microsoft.Automata Z3 Bek Program Code Gen Code Gen C# JavaScript C
Bek: Architecture Transformation Symbolic Finite Transducers Does it do the right thing? Counterexample “\' vs. \\'” Analysis s := iter(c in t) [b := false;] { case (!b && c in "[\"\\]"): b := false; yield('\\', c); case (c == '\\'): b := !b; yield(c); case (true): b := false; yield(c); }; Microsoft.Automata Z3 Bek Program Now what? Code Gen Code Gen C# JavaScript C
Equivalence Checking SFT Algorithms
Equivalence Checking SFT Algorithms AntiXSS.HtmlEncode WebUtility.HtmlEncode
Join Composition SFT Algorithms SFT A B SFT A SFT B in in out out
Join Composition SFT Algorithms SFT A B SFT A SFT B in in out out JavaScriptEncode(HtmlEncode(w)) HtmlEncode(JavaScriptEncode(w))
Pre-Image Computation Regular Language Regular Language S SFT A in
Pre-Image Computation Regular Language Regular Language S ? SFT A in
Contributions • Bek • Frontend: a small language for string manipulation; similar to how sanitizers are written today • Backend: a model based on symbolic finite transducers with algorithms for analysis and code generation • Evaluation • Converted sanitizers from a variety of sources • Checked properties like reversibility, idempotence, equivalence, and commutativity
Some Questions • What features are needed to port existing sanitizers? • Can we check interesting properties on real sanitizers? • Will HtmlEnc implementations protect against XSS Cheat Sheet samples?