Peter, thanks for your quick feedback on this. It makes me interested to learn more about the product. We definitely lean toward buying or using a third-party solution rather than creating our own. However, for some reason I did not come across your toolkit when researching the topic. The best searches I could think of came up with a PHP utility called SafeHTML. I was searching along those lines because we _need_ to allow some HTML tags, like tables, images, etc. Yet even SRC attributes can contain javascript or vbscript baggage.
To that end, I created an algorithm that allows for a White List, and does a few Black List removals as well, based loosely on the open-source PHP SafeHTML class.
The class is below. The blacklist I use to guard against dangerous style content is the StyleTagContentBlackList shown right after this note. Writing this class was not rocket science, so I don't mind if someone else gets ahold of it and wants to use it. I am more concerned that we get a robust solution, and I think your toolkit may be the answer, but I want to ask you: do you think it will do this and more?
Thanks again!
/// <summary>
/// Regular-expression fragments that mark CSS text as dangerous. Style content that
/// matches any one of them is removed completely rather than repaired.
/// </summary>
public static string[] StyleTagContentBlackList =
{
    // Plain keywords.
    "expression",
    "@import",
    "behavior",
    "-moz-binding",

    // url( ... ) references whose target begins with "jav" or "vbs",
    // tolerating whitespace between the individual characters.
    "url\\s*?\\(\\s*?j\\s*?a\\s*?v",
    "url\\s*?\\(\\s*?v\\s*?b\\s*?s"
};
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using System.IO;

using HtmlAgilityPack;
using Microsoft.Security.Application;

namespace WebApp.Utils
{
    /// <summary>
    /// Purifies HTML content. Tags that are not white-listed are dropped (their text
    /// content is kept), a small set of tags is dropped together with all of its
    /// content, comments are dropped, and attributes are removed when their name is
    /// black-listed, when they carry dangerous CSS, or when a URL attribute uses a
    /// protocol that is not white-listed.
    /// </summary>
    public static class SafeHtmlUtil
    {
        /// <summary>
        /// Tags that will be allowed. Any other tag is stripped; its children are
        /// still processed.
        /// </summary>
        public static string[] TagsWhiteList = new string[]
        {
            "a", "b", "u", "i", "strong", "em", "cite",
            "area", "br", "img", "hr", "wbr", "address", "blockquote", "center",
            "dd", "dir", "div", "dl", "dt", "h1", "h2", "h3", "h4", "h5", "h6",
            "isindex", "listing", "marquee", "menu", "multicol", "ol", "p",
            "plaintext", "pre", "table", "ul", "xmp",
            "caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr"
        };

        /// <summary>
        /// Tags to explicitly remove and remove all child content of as well.
        /// </summary>
        public static string[] TagsBlackListDeleteContent = new string[]
        {
            "script", "object", "embed", "title", "xml"
        };

        /// <summary>
        /// Protocols to allow in attributes.
        /// </summary>
        public static string[] ProtocolsWhiteList = new string[]
        {
            "ed2k", "file", "ftp", "gopher", "http", "https", "irc", "mailto", "news",
            "nntp", "telnet", "webcal", "xmpp", "callto"
        };

        /// <summary>
        /// Attributes to examine for protocol handlers.
        /// </summary>
        public static string[] ProtocolsAttributes = new string[]
        {
            "action", "background", "codebase", "dynsrc", "href", "lowsrc", "src"
        };

        /// <summary>
        /// Attribute-name prefixes that are always removed (for example every
        /// attribute starting with "on"). Note that it could be a dangerous
        /// assumption that _only_ these attributes need removing. In the future,
        /// browsers could introduce other dangerous attributes and this code would
        /// need to be updated.
        /// </summary>
        public static string[] AttributesBlackList = new string[]
        {
            "on", "data"
        };

        /// <summary>
        /// Regular-expression fragments; if style content (a style tag's body or a
        /// style attribute's value) matches any of them, it is removed completely.
        /// </summary>
        public static string[] StyleTagContentBlackList = new string[]
        {
            "expression", "@import", "behavior", "-moz-binding",
            "url\\s*?\\(\\s*?j\\s*?a\\s*?v",
            "url\\s*?\\(\\s*?v\\s*?b\\s*?s"
        };

        // Compiled form of StyleTagContentBlackList, built once in the static constructor.
        private static readonly Regex[] StyleTagContentBlackListPatterns;

        // Everything before the first ':' counts as a scheme, provided no '/', '?' or
        // '#' comes first (those mark a relative URL whose later ':' is harmless).
        private static readonly Regex SchemePattern = new Regex("^([^/?#:]*):");

        // Named character references that the entity decoder may not know about but
        // that browsers resolve; handled explicitly so they cannot hide a scheme.
        private static readonly Regex ColonEntityPattern =
            new Regex("&colon;", RegexOptions.IgnoreCase);
        private static readonly Regex WhitespaceEntityPattern =
            new Regex("&(tab|newline);", RegexOptions.IgnoreCase);

        // Elements that never have content and may therefore be written as "<x />".
        private static readonly string[] VoidElements = new string[]
        {
            "area", "base", "basefont", "bgsound", "br", "col", "embed", "frame",
            "hr", "img", "input", "isindex", "keygen", "link", "meta", "param",
            "source", "track", "wbr"
        };

        static SafeHtmlUtil()
        {
            // Configure list of bad word patterns to trigger dangerous CSS text
            StyleTagContentBlackListPatterns = new Regex[StyleTagContentBlackList.Length];

            for (int i = 0; i < StyleTagContentBlackList.Length; i++)
            {
                StyleTagContentBlackListPatterns[i] = new Regex(
                    StyleTagContentBlackList[i],
                    RegexOptions.IgnoreCase | RegexOptions.Singleline);
            }
        }

        // **
        // These are not used, but could be useful for something, so I've left them
        // commented out for now.
        // **

        //public static string[] TagsBlackList = new string[]
        // {
        // "applet", "base", "basefont", "bgsound", "blink", "body", "embed",
        // "frame", "frameset", "head", "html", "ilayer", "iframe", "layer",
        // "link", "meta", "object", "style", "title", "script"
        // };

        //public static string[] ProtocolsBlackList = new string[]
        // {
        // "about", "chrome", "data", "disk", "hcp", "help", "javascript",
        // "livescript", "lynxcgi", "lynxexec", "ms-help", "ms-its", "mhtml",
        // "mocha", "opera", "res", "resource", "shell", "vbscript", "view-source",
        // "vnd.ms.radio", "wysiwyg"
        // };

        /// <summary>
        /// Purifies a string of HTML.
        /// </summary>
        /// <param name="input">String of HTML to purify</param>
        /// <returns>
        /// A result whose <c>Input</c> is the original string. On success <c>Output</c>
        /// holds the purified HTML and <c>Removals</c> lists what was removed (it is
        /// only assigned when something was removed). If anything throws, the
        /// exception is stored in <c>Exception</c> and <c>Output</c> is not assigned,
        /// so a failed run never hands back unfiltered markup.
        /// </returns>
        public static SafeHtmlParseResult GetSafeHtml(string input)
        {
            SafeHtmlParseResult result = new SafeHtmlParseResult();
            result.Input = input;

            try
            {
                List<string[]> removals = new List<string[]>();

                // Pass 1: walk the parsed tree and re-serialize only allowed tags.
                HtmlDocument source = new HtmlDocument();
                source.LoadHtml(input);

                StringBuilder sb = new StringBuilder();
                using (StringWriter writer = new StringWriter(sb))
                {
                    GetSafeHtmlIter(source.DocumentNode, writer, removals);
                }

                // Pass 2: re-parse what was actually written and filter attributes,
                // so the checks see the markup exactly as it will be emitted.
                HtmlDocument cleaned = new HtmlDocument();
                cleaned.LoadHtml(sb.ToString());

                HtmlNodeCollection nodes = cleaned.DocumentNode.SelectNodes("//*");

                if (nodes != null)
                {
                    foreach (HtmlNode node in nodes)
                    {
                        RemoveUnsafeAttributes(node, removals);
                    }
                }

                result.Output = cleaned.DocumentNode.OuterHtml;

                if (removals.Count > 0)
                {
                    result.Removals = removals;
                }
            }
            catch (Exception ex)
            {
                result.Exception = ex;
            }

            return result;
        }

        /// <summary>
        /// Removes every unsafe attribute from <paramref name="node"/> and records
        /// each removal as { attribute name, reason }.
        /// </summary>
        private static void RemoveUnsafeAttributes(HtmlNode node, List<string[]> removals)
        {
            // Work on a snapshot: removing from the live collection while enumerating
            // it either throws or silently skips the following attribute.
            List<HtmlAttribute> snapshot = new List<HtmlAttribute>();
            foreach (HtmlAttribute attr in node.Attributes)
            {
                snapshot.Add(attr);
            }

            foreach (HtmlAttribute attr in snapshot)
            {
                string reason = GetAttributeRemovalReason(attr);

                if (reason != null)
                {
                    // Each attribute is removed at most once, whatever rules it breaks.
                    node.Attributes.Remove(attr);
                    removals.Add(new string[] { attr.Name, reason });
                }
            }
        }

        /// <summary>
        /// Returns the rule that makes the attribute unsafe (a style pattern, a
        /// black-listed name prefix, or the offending protocol), or null when the
        /// attribute may stay.
        /// </summary>
        private static string GetAttributeRemovalReason(HtmlAttribute attr)
        {
            string name = attr.Name ?? string.Empty;
            string value = attr.Value ?? string.Empty;

            // Style attributes with malicious code. The value is checked both as
            // written and with character references decoded, because browsers decode
            // them before interpreting the CSS.
            if (string.Equals(name, "style", StringComparison.OrdinalIgnoreCase))
            {
                string decoded = HtmlEntity.DeEntitize(value) ?? value;

                foreach (Regex pattern in StyleTagContentBlackListPatterns)
                {
                    if (pattern.IsMatch(value) || pattern.IsMatch(decoded))
                    {
                        return pattern.ToString();
                    }
                }
            }

            // Black listed attributes (matched by name prefix).
            foreach (string attrPrefix in AttributesBlackList)
            {
                if (name.StartsWith(attrPrefix, StringComparison.OrdinalIgnoreCase))
                {
                    return attrPrefix;
                }
            }

            // URL-carrying attributes: an explicit protocol must be white-listed.
            // Values without a protocol (relative URLs) are left alone.
            if (ContainsIgnoreCase(ProtocolsAttributes, name))
            {
                string scheme = ExtractScheme(value);

                if (scheme != null && !ContainsIgnoreCase(ProtocolsWhiteList, scheme))
                {
                    return scheme;
                }
            }

            return null;
        }

        /// <summary>
        /// Returns the lower-cased protocol of a URL attribute value, or null when
        /// the value has none. Character references are decoded and whitespace and
        /// control characters are dropped first, so "jav&amp;#x09;ascript:" and
        /// similar obfuscations are still recognised.
        /// </summary>
        private static string ExtractScheme(string rawValue)
        {
            string decoded = HtmlEntity.DeEntitize(rawValue) ?? string.Empty;
            decoded = ColonEntityPattern.Replace(decoded, ":");
            decoded = WhitespaceEntityPattern.Replace(decoded, string.Empty);

            StringBuilder compact = new StringBuilder(decoded.Length);
            foreach (char c in decoded)
            {
                if (!char.IsWhiteSpace(c) && !char.IsControl(c))
                {
                    compact.Append(c);
                }
            }

            Match match = SchemePattern.Match(compact.ToString());
            return match.Success ? match.Groups[1].Value.ToLowerInvariant() : null;
        }

        /// <summary>
        /// Ordinal, case-insensitive membership test.
        /// </summary>
        private static bool ContainsIgnoreCase(string[] list, string candidate)
        {
            foreach (string item in list)
            {
                if (string.Equals(item, candidate, StringComparison.OrdinalIgnoreCase))
                {
                    return true;
                }
            }

            return false;
        }

        /// <summary>
        /// Recursively writes the safe rendition of <paramref name="node"/> to
        /// <paramref name="writer"/>, recording removals along the way.
        /// </summary>
        private static void GetSafeHtmlIter(HtmlNode node, TextWriter writer, List<string[]> removals)
        {
            if (node.NodeType == HtmlNodeType.Text)
            {
                node.WriteTo(writer);
                return;
            }

            if (node.NodeType == HtmlNodeType.Comment)
            {
                // Comments are dropped: conditional comments can smuggle markup
                // (including script) past the tag filter.
                removals.Add(new string[] { "Deleted comment", node.Name });
                return;
            }

            // Tags removed together with all of their content. Checked before the
            // white list so the delete list wins if a tag is in both.
            if (ContainsIgnoreCase(TagsBlackListDeleteContent, node.Name))
            {
                removals.Add(new string[] { "Deleted tag and child content", node.Name });
                return;
            }

            // Strip style tags with malicious code
            if (string.Equals(node.Name, "style", StringComparison.OrdinalIgnoreCase))
            {
                string css = node.InnerHtml;

                foreach (Regex pattern in StyleTagContentBlackListPatterns)
                {
                    if (pattern.IsMatch(css))
                    {
                        removals.Add(new string[] { node.Name, pattern.ToString() });
                        return;
                    }
                }
            }

            bool allowed = ContainsIgnoreCase(TagsWhiteList, node.Name);

            if (allowed)
            {
                WriteBeginTag(node, writer);
            }

            // Children are processed even when the tag itself is not allowed, so the
            // content of a stripped tag survives.
            foreach (HtmlNode childNode in node.ChildNodes)
            {
                GetSafeHtmlIter(childNode, writer, removals);
            }

            if (allowed)
            {
                WriteEndTag(node, writer);
            }
        }

        /// <summary>
        /// True when the node is written as a single self-closing tag: it has no
        /// children and is a void element. Other empty elements get an explicit end
        /// tag, because HTML parsers treat "&lt;div /&gt;" as an opening tag.
        /// </summary>
        private static bool IsSelfClosed(HtmlNode node)
        {
            return node.ChildNodes.Count == 0 && ContainsIgnoreCase(VoidElements, node.Name);
        }

        /// <summary>
        /// Writes the opening tag of <paramref name="node"/> with all of its
        /// attributes; attribute filtering happens later, on the re-parsed output.
        /// </summary>
        private static void WriteBeginTag(HtmlNode node, TextWriter writer)
        {
            writer.Write("<" + node.Name);

            foreach (HtmlAttribute attr in node.Attributes)
            {
                // The value is always emitted inside double quotes, so a literal
                // double quote must be escaped or it would end the attribute early.
                string value = (attr.Value ?? string.Empty).Replace("\"", "&quot;");
                writer.Write(" " + attr.Name + "=\"" + value + "\"");
            }

            if (IsSelfClosed(node))
            {
                writer.Write(" />");
            }
            else
            {
                writer.Write(">");
            }
        }

        /// <summary>
        /// Writes the closing tag of <paramref name="node"/>, unless the opening tag
        /// was written in self-closing form.
        /// </summary>
        private static void WriteEndTag(HtmlNode node, TextWriter writer)
        {
            if (!IsSelfClosed(node))
            {
                writer.Write("</" + node.Name + ">");
            }
        }
    }
}