//------------------------------------------------------------------------------
//
//     Copyright (c) Telligent Systems Corporation.  All rights reserved.
//
//------------------------------------------------------------------------------
//refactored by Rob Conery on 8/9/2007 for use with SubSonic Forums... hope this is OK :)

using System;
using System.Collections.Specialized;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using System.Web.Caching;
using System.Xml;

namespace SubSonic.Forums.Data
{
    /// <summary>
    /// Validates HTML, only letting a predefined set of elements/attributes remain.
    /// Text outside of tags is HTML-encoded; tags that are not on the whitelist are
    /// either removed or HTML-encoded, depending on how the scrubber was constructed.
    /// </summary>
    public class HtmlScrubber
    {
        #region Static Default

        /// <summary>
        /// Builds the default whitelist: tag name -> set of attribute names allowed on that tag.
        /// </summary>
        static Dictionary<string, Dictionary<string, bool>> DefaultTags()
        {
            Dictionary<string, Dictionary<string, bool>> defaultTags = new Dictionary<string, Dictionary<string, bool>>();

            defaultTags.Add("h1", ParseAttributeList("align"));
            defaultTags.Add("h2", ParseAttributeList("align"));
            defaultTags.Add("h3", ParseAttributeList("align"));
            defaultTags.Add("h4", ParseAttributeList("align"));
            defaultTags.Add("h5", ParseAttributeList("align"));
            defaultTags.Add("h6", ParseAttributeList("align"));
            defaultTags.Add("strong", ParseAttributeList(""));
            defaultTags.Add("em", ParseAttributeList(""));
            defaultTags.Add("u", ParseAttributeList(""));
            defaultTags.Add("b", ParseAttributeList(""));
            defaultTags.Add("i", ParseAttributeList(""));
            defaultTags.Add("strike", ParseAttributeList(""));
            defaultTags.Add("sup", ParseAttributeList(""));
            defaultTags.Add("sub", ParseAttributeList(""));
            defaultTags.Add("font", ParseAttributeList("color,size,face"));
            defaultTags.Add("blockquote", ParseAttributeList("dir"));
            defaultTags.Add("ul", ParseAttributeList(""));
            defaultTags.Add("ol", ParseAttributeList(""));
            defaultTags.Add("li", ParseAttributeList(""));
            defaultTags.Add("p", ParseAttributeList("class,align,dir"));
            defaultTags.Add("address", ParseAttributeList(""));
            defaultTags.Add("pre", ParseAttributeList("class"));
            defaultTags.Add("div", ParseAttributeList("align"));
            defaultTags.Add("hr", ParseAttributeList("id"));
            defaultTags.Add("br", ParseAttributeList(""));
            defaultTags.Add("a", ParseAttributeList("href,target,name"));
            defaultTags.Add("span", ParseAttributeList("align"));
            defaultTags.Add("img", ParseAttributeList("src,alt,title"));

            return defaultTags;
        }

        /// <summary>
        /// Turns a comma separated list of attribute names into a lookup set.
        /// Names are lower-cased with the invariant culture so that lookups do not
        /// depend on the current thread culture.
        /// </summary>
        static Dictionary<string, bool> ParseAttributeList(string attributeList)
        {
            Dictionary<string, bool> atts = new Dictionary<string, bool>();

            foreach (string att in attributeList.Split(','))
            {
                if (!string.IsNullOrEmpty(att))
                {
                    atts[att.ToLowerInvariant()] = true;
                }
            }

            return atts;
        }

        /// <summary>
        /// Builds the default set of tags that never have a closing tag.
        /// </summary>
        static Dictionary<string, bool> DefaultSelfContainedTags()
        {
            Dictionary<string, bool> defaultTags = new Dictionary<string, bool>();

            defaultTags.Add("br", true);
            defaultTags.Add("img", true);
            defaultTags.Add("input", true);
            defaultTags.Add("meta", true);
            defaultTags.Add("base", true);
            defaultTags.Add("hr", true);

            return defaultTags;
        }

        // Whitelist used by this instance: tag name -> allowed attribute names.
        protected Dictionary<string, Dictionary<string, bool>> allowedTags = null;

        // Tags that are always rendered in the self-closing form and whose end tags are dropped.
        protected Dictionary<string, bool> selfContainedTags = null;

        // Tokenizer: anything that looks like a tag. The closing '>' is optional, so
        // callers must be prepared for unterminated matches.
        static readonly Regex regex = new Regex("<[^<>]+>?", RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.Multiline);
        static readonly Regex jsAttributeRegex = new Regex("javascript:", RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.Compiled);
        static readonly Regex xmlLineBreak = new Regex("&#x([DA9]|20|85|2028|0A|0D)(;)?", RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.Compiled);
        // NOTE(review): filterdCharacters and bannedChars are not referenced anywhere in this class.
        static readonly Regex filterdCharacters = new Regex("\\=|\\\"|\\'|\\s", RegexOptions.Compiled);
        static readonly Regex validProtocols = new Regex("^((http(s)?|mailto|mms):|/)", RegexOptions.Compiled | RegexOptions.IgnoreCase);
        static readonly Regex bannedChars = new Regex("\\s", RegexOptions.Compiled);
        static readonly Regex styleProperty = new Regex(@"^[a-z\-]*$", RegexOptions.IgnoreCase | RegexOptions.Compiled);
        static readonly Regex styleValue = new Regex(@"^(\#?[a-z0-9\.\-\,\']*\%?|\s+|url\(\'?(?:(?:http(s)?|mailto|mms):|/)[^\;\:\)\<\>\n\r\*\""\'\\]*?\'?\))+$", RegexOptions.IgnoreCase | RegexOptions.Compiled);

        // Characters treated as separators between a tag name and its attributes,
        // and between attributes.
        static readonly char[] blankChars = new char[] { ' ', '\r', '\n' };

        #endregion

        #region Private members

        string input = null;
        StringBuilder output = new StringBuilder();
        bool cleanJS = false;
        bool isFormatted = false;
        bool encodeExceptions = false;

        #endregion

        #region Constructors

        /// <summary>
        /// Creates a scrubber with no input and the default whitelist.
        /// Kept so that existing <c>new HtmlScrubber()</c> call sites keep compiling;
        /// <see cref="Clean()"/> returns an empty string for such an instance.
        /// </summary>
        public HtmlScrubber()
            : this(string.Empty, false, false)
        {
        }

        /// <summary>
        /// Creates a scrubber for <paramref name="html"/> that removes disallowed tags
        /// and filters "javascript:" out of attribute values.
        /// </summary>
        /// <param name="html">The markup to scrub. Null is treated as empty.</param>
        public HtmlScrubber(string html)
            : this(html, false, true)
        {
        }

        /// <summary>
        /// Creates a scrubber for <paramref name="html"/> using the default whitelist.
        /// </summary>
        /// <param name="html">The markup to scrub. Null is treated as empty.</param>
        /// <param name="encodeExceptions">
        /// True to HTML-encode tags that are not allowed; false to remove them.
        /// </param>
        /// <param name="filterScripts">
        /// True to remove every occurrence of "javascript:" from rendered attributes.
        /// </param>
        public HtmlScrubber(string html, bool encodeExceptions, bool filterScripts)
        {
            this.input = html == null ? string.Empty : html;
            this.encodeExceptions = encodeExceptions;
            this.cleanJS = filterScripts;
            this.allowedTags = DefaultTags();
            this.selfContainedTags = DefaultSelfContainedTags();
        }

        #endregion

        #region Cleaners

        /// <summary>
        /// Returns the results of a cleaning. The input is only walked once;
        /// subsequent calls return the cached result.
        /// </summary>
        /// <returns>The scrubbed markup.</returns>
        public string Clean()
        {
            if (!isFormatted)
            {
                Format();
                isFormatted = true;
            }

            return output.ToString();
        }

        #endregion

        #region Format / Walk

        /// <summary>
        /// HTML-encodes <paramref name="s"/>, mapping null/empty to an empty string.
        /// </summary>
        string EnsureHtmlEncode(string s)
        {
            string result = "";

            if (!String.IsNullOrEmpty(s))
            {
                result = System.Web.HttpUtility.HtmlEncode(s);
            }

            return result;
        }

        /// <summary>
        /// Walks one time through the HTML. All elements/tags are validated.
        /// The rest of the text is HTML-encoded and appended to the output.
        /// </summary>
        protected virtual void Format()
        {
            // Nothing to walk; also protects Regex.Match from a null input.
            if (string.IsNullOrEmpty(input))
            {
                return;
            }

            int lastEndIndex = 0;
            Match match = regex.Match(input);

            while (match.Success)
            {
                if (lastEndIndex != match.Index)
                {
                    // add whatever text was between the tags
                    output.Append(EnsureHtmlEncode(input.Substring(lastEndIndex, match.Index - lastEndIndex)));
                }

                output.Append(Validate(match.Value));

                //Get the next match
                lastEndIndex = match.Index + match.Length;
                match = regex.Match(input, lastEndIndex);
            }

            // add whatever text exists after the last tag
            if (lastEndIndex < input.Length)
            {
                output.Append(EnsureHtmlEncode(input.Substring(lastEndIndex)));
            }
        }

        #endregion

        #region Validators

        /// <summary>
        /// True when <paramref name="tag"/> has the shape "&lt;...&gt;" with at least one
        /// character between the brackets. The tokenizer can hand us unterminated
        /// fragments (for example a trailing "&lt;/" or "&lt;b"), which must not reach the
        /// index arithmetic in the validators.
        /// </summary>
        private static bool IsCompleteTag(string tag)
        {
            return tag != null
                && tag.Length >= 3
                && tag[0] == '<'
                && tag[tag.Length - 1] == '>';
        }

        /// <summary>
        /// Produces the output for a tag that is not allowed: encoded or removed.
        /// </summary>
        private string RejectTag(string tag)
        {
            return encodeExceptions ? EnsureHtmlEncode(tag) : string.Empty;
        }

        /// <summary>
        /// Main method for starting element validation. Dispatches to the end tag,
        /// single tag or start tag validator.
        /// </summary>
        /// <param name="tag">The raw tag text as matched by the tokenizer.</param>
        /// <returns>The validated tag, or the rejected form of it.</returns>
        protected string Validate(string tag)
        {
            if (!IsCompleteTag(tag))
            {
                return RejectTag(tag);
            }

            if (tag.StartsWith("</", StringComparison.Ordinal))
            {
                return ValidateEndTag(tag);
            }

            if (tag.EndsWith("/>", StringComparison.Ordinal))
            {
                return ValidateSingleTag(tag);
            }

            return ValidateStartTag(tag);
        }

        /// <summary>
        /// Validates single element tags such as &lt;br /&gt; and &lt;img src="..." /&gt;.
        /// </summary>
        private string ValidateSingleTag(string tag)
        {
            if (!IsCompleteTag(tag))
            {
                return RejectTag(tag);
            }

            // Drop the leading '<' and the trailing "/>".
            string strip = tag.Substring(1, tag.Length - 3).Trim();

            int index = strip.IndexOfAny(blankChars);
            if (index == -1)
            {
                index = strip.Length;
            }

            string tagName = strip.Substring(0, index).ToLowerInvariant();

            Dictionary<string, bool> allowedAttributes = null;
            if (!allowedTags.TryGetValue(tagName, out allowedAttributes))
            {
                return RejectTag(tag);
            }

            string atts = strip.Substring(index).Trim();

            return ValidateAttributes(allowedAttributes, atts, tagName, "<{0}{1} />");
        }

        /// <summary>
        /// Validates a start tag.
        /// </summary>
        /// <param name="tag">The raw start tag, including the angle brackets.</param>
        /// <returns>The tag with validated attributes, or the rejected form of it.</returns>
        protected virtual string ValidateStartTag(string tag)
        {
            if (!IsCompleteTag(tag))
            {
                return RejectTag(tag);
            }

            //Check for potential attributes
            int endIndex = tag.IndexOfAny(blankChars);

            //simple tag
            if (endIndex == -1)
            {
                endIndex = tag.Length - 1;
            }

            //Grab the tag name
            string tagName = tag.Substring(1, endIndex - 1).ToLowerInvariant();

            Dictionary<string, bool> allowedAttributes = null;
            if (!allowedTags.TryGetValue(tagName, out allowedAttributes))
            {
                //this tag is not on the whitelist: remove element and all attributes
                return RejectTag(tag);
            }

            //everything between the tag name and the closing '>' is the attribute list
            string attributes = tag.Substring(endIndex, tag.Length - endIndex - 1).Trim();

            //Validate the attributes
            if (selfContainedTags.ContainsKey(tagName))
            {
                return ValidateAttributes(allowedAttributes, attributes, tagName, "<{0}{1} />");
            }

            return ValidateAttributes(allowedAttributes, attributes, tagName, "<{0}{1}>");
        }

        /// <summary>
        /// Removes the quotes around an attribute value. A value that opens with a
        /// quote but never closes it only loses the opening quote.
        /// </summary>
        private static string StripQuotes(string value)
        {
            if (value.Length == 0)
            {
                return value;
            }

            char quote = value[0];
            if (quote != '"' && quote != '\'')
            {
                return value;
            }

            if (value.Length >= 2 && value[value.Length - 1] == quote)
            {
                return value.Substring(1, value.Length - 2);
            }

            return value.Substring(1);
        }

        /// <summary>
        /// Validates the element's attribute collection and renders the tag.
        /// </summary>
        /// <param name="allowedAttributes">Attribute names allowed on this tag (lower case).</param>
        /// <param name="tagAttributes">The raw attribute text of the tag. Null is treated as empty.</param>
        /// <param name="tagName">The lower-cased tag name.</param>
        /// <param name="tagFormat">Format string taking the tag name ({0}) and the rendered attributes ({1}).</param>
        /// <returns>The rendered tag containing only the attributes that passed validation.</returns>
        protected virtual string ValidateAttributes(Dictionary<string, bool> allowedAttributes, string tagAttributes, string tagName, string tagFormat)
        {
            //container for attributes.
            StringBuilder atts = new StringBuilder();
            bool hasAlt = false;

            if (tagAttributes == null)
            {
                tagAttributes = string.Empty;
            }

            //Do we even have any attributes to validate?
            if (allowedAttributes.Count > 0)
            {
                tagAttributes = xmlLineBreak.Replace(tagAttributes, string.Empty);
                int length = tagAttributes.Length;
                int start = 0;

                while (start < length)
                {
                    //Put the end index at the end of the attribute name.
                    int end = tagAttributes.IndexOf('=', start);
                    if (end < 0)
                    {
                        end = length;
                    }

                    //Get the attribute name and see if it's allowed.
                    string att = tagAttributes.Substring(start, end - start).Trim().ToLowerInvariant();
                    bool allowed = allowedAttributes.ContainsKey(att);

                    //Now advance the end index to include the attribute value.
                    //valueStart == end means "no value present".
                    int valueStart = end;
                    if (end < length)
                    {
                        //Skip the '=' and any blanks after it.
                        ++end;
                        while (end < length && Array.IndexOf(blankChars, tagAttributes[end]) >= 0)
                        {
                            ++end;
                        }

                        valueStart = end;

                        if (end < length)
                        {
                            //Find the end of the value.
                            char first = tagAttributes[end];
                            int terminator = (first == '"' || first == '\'')
                                ? tagAttributes.IndexOf(first, end + 1)     //Quoted value.
                                : tagAttributes.IndexOfAny(blankChars, end); //Otherwise, assume not quoted.

                            //If we didn't find the terminating character, just go to the end of the string.
                            //Otherwise, advance the end index past the terminating character.
                            end = terminator < 0 ? length : terminator + 1;
                        }
                    }

                    //If the attribute is allowed, copy it.
                    if (allowed)
                    {
                        //A missing or empty value yields an empty string instead of throwing.
                        string attValue = StripQuotes(tagAttributes.Substring(valueStart, end - valueStart).Trim());

                        //Special actions on these attributes. IE will render just about anything that looks like the word javascript:
                        //this includes line breaks, special characters codes, etc.
                        if (att == "src" || att == "href")
                        {
                            attValue = EnsureHtmlEncode(attValue);

                            //Encode spaces
                            attValue = attValue.Replace(" ", "%20");

                            //validate only http, https, mailto, mms and / (relative) requests are made
                            if (validProtocols.IsMatch(attValue))
                            {
                                atts.Append(" ");
                                atts.Append(att);
                                atts.Append("=\"");
                                atts.Append(attValue);
                                atts.Append("\"");
                            }
                            //If the "if" above fails, we do not render the attribute!
                        }
                        else if (att == "style")
                        {
                            // convert to string builder for replacements
                            StringBuilder oldAttValue = new StringBuilder(attValue);

                            // replace double quot w/ single quote
                            oldAttValue.Replace("&quot;", "'");

                            // replace #34 w/ single quote
                            oldAttValue.Replace("&#34;", "'");

                            // replace #39 w/ single quote
                            oldAttValue.Replace("&#39;", "'");

                            string[] nameValues = oldAttValue.ToString().Split(';');
                            StringBuilder newAttValue = new StringBuilder();

                            for (int i = 0; i < nameValues.Length; i++)
                            {
                                string[] nameValue = nameValues[i].Split(':');
                                if (nameValue.Length == 2)
                                {
                                    nameValue[0] = nameValue[0].Trim();
                                    nameValue[1] = nameValue[1].Trim();

                                    if (styleProperty.IsMatch(nameValue[0]) && styleValue.IsMatch(nameValue[1]))
                                    {
                                        newAttValue.Append(nameValue[0]);
                                        newAttValue.Append(":");
                                        newAttValue.Append(nameValue[1]);
                                        newAttValue.Append(";");
                                    }
                                }
                            }

                            if (newAttValue.Length > 0)
                            {
                                atts.Append(" style=\"");
                                atts.Append(EnsureHtmlEncode(newAttValue.ToString()));
                                atts.Append("\"");
                            }
                        }
                        else
                        {
                            if (att == "alt")
                            {
                                hasAlt = true;
                            }

                            atts.Append(" ");
                            atts.Append(att);
                            atts.Append("=\"");
                            atts.Append(EnsureHtmlEncode(attValue));
                            atts.Append("\"");
                        }
                    }

                    start = end;
                }

                //Are we filtering for Javascript?
                if (cleanJS)
                {
                    //Repeat until stable: a single pass would turn
                    //"javajavascript:script:" back into "javascript:".
                    string rendered = atts.ToString();
                    while (jsAttributeRegex.IsMatch(rendered))
                    {
                        rendered = jsAttributeRegex.Replace(rendered, string.Empty);
                    }

                    atts = new StringBuilder(rendered);
                }
            }

            //Images always get an alt attribute.
            if (tagName == "img" && !hasAlt)
            {
                atts.Append(" alt=\"\"");
            }

            return string.Format(tagFormat, tagName, atts.ToString());
        }

        /// <summary>
        /// Validate End/Closing tag.
        /// </summary>
        /// <param name="tag">The raw end tag, e.g. "&lt;/B&gt;".</param>
        /// <returns>
        /// The lower-cased end tag when the tag is allowed; an empty string for
        /// self-contained tags (they are rendered self-closed); otherwise the
        /// rejected form of the tag.
        /// </returns>
        protected virtual string ValidateEndTag(string tag)
        {
            if (!IsCompleteTag(tag))
            {
                return RejectTag(tag);
            }

            //Drop the leading "</" and the trailing '>'.
            string tagName = tag.Substring(2, tag.Length - 3).ToLowerInvariant();

            if (!allowedTags.ContainsKey(tagName))
            {
                return RejectTag(tag);
            }

            if (selfContainedTags.ContainsKey(tagName))
            {
                return string.Empty;
            }

            return "</" + tagName + ">";
        }

        #endregion
    }
}