//------------------------------------------------------------------------------
//
// Copyright (c) Telligent Systems Corporation. All rights reserved.
//
//------------------------------------------------------------------------------
//refactored by Rob Conery on 8/9/2007 for use with SubSonic Forums... hope this is OK :)
using System;
using System.Collections.Specialized;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using System.Web.Caching;
using System.Xml;
namespace SubSonic.Forums.Data {
/// <summary>
/// Validates HTML, letting only a predefined set of elements and attributes remain.
/// </summary>
public class HtmlScrubber {
#region Static Default
/// <summary>
/// Builds the default whitelist: each permitted element name mapped to the
/// lookup of attribute names permitted on that element.
/// </summary>
/// <returns>
/// A new dictionary keyed by tag name; each value is the lookup produced by
/// <c>ParseAttributeList</c> for that tag's comma-separated attribute list.
/// </returns>
static Dictionary<string, Dictionary<string, bool>> DefaultTags() {
    // { tag name, comma-separated attributes allowed on it } -- "" means no attributes.
    string[,] whitelist = {
        { "h1", "align" },
        { "h2", "align" },
        { "h3", "align" },
        { "h4", "align" },
        { "h5", "align" },
        { "h6", "align" },
        { "strong", "" },
        { "em", "" },
        { "u", "" },
        { "b", "" },
        { "i", "" },
        { "strike", "" },
        { "sup", "" },
        { "sub", "" },
        { "font", "color,size,face" },
        { "blockquote", "dir" },
        { "ul", "" },
        { "ol", "" },
        { "li", "" },
        { "p", "class,align,dir" },
        { "address", "" },
        { "pre", "class" },
        { "div", "align" },
        { "hr", "id" },
        { "br", "" },
        { "a", "href,target,name" },
        { "span", "align" },
        { "img", "src,alt,title" }
    };

    int count = whitelist.GetLength(0);
    Dictionary<string, Dictionary<string, bool>> defaultTags =
        new Dictionary<string, Dictionary<string, bool>>(count);
    for (int i = 0; i < count; i++) {
        defaultTags.Add(whitelist[i, 0], ParseAttributeList(whitelist[i, 1]));
    }
    return defaultTags;
}
/// <summary>
/// Converts a comma-separated attribute list into a lookup keyed by
/// lower-cased attribute name.
/// </summary>
/// <param name="attributeList">
/// Comma-separated attribute names, e.g. "href,target,name". Null, empty, or
/// blank entries contribute nothing.
/// </param>
/// <returns>A dictionary whose keys are the attribute names; every value is <c>true</c>.</returns>
static Dictionary<string, bool> ParseAttributeList(string attributeList) {
    Dictionary<string, bool> atts = new Dictionary<string, bool>();
    if (string.IsNullOrEmpty(attributeList)) {
        return atts;
    }
    foreach (string att in attributeList.Split(',')) {
        // Trim so "href, target" yields "target" rather than an unmatchable " target" key.
        string name = att.Trim();
        if (name.Length > 0) {
            // Invariant casing keeps keys stable under cultures such as tr-TR ("ID" -> "id").
            atts[name.ToLowerInvariant()] = true;
        }
    }
    return atts;
}
/// <summary>
/// Builds the default lookup of elements that are rendered self-closed
/// (no separate end tag).
/// </summary>
/// <returns>A new dictionary keyed by tag name; every value is <c>true</c>.</returns>
static Dictionary<string, bool> DefaultSelfContainedTags() {
    string[] selfClosing = { "br", "img", "input", "meta", "base", "hr" };
    Dictionary<string, bool> defaultTags = new Dictionary<string, bool>(selfClosing.Length);
    foreach (string tagName in selfClosing) {
        defaultTags.Add(tagName, true);
    }
    return defaultTags;
}
// Whitelist consulted by the validators: tag name -> lookup of attribute names allowed on it.
// NOTE(review): no assignment is visible in this file; presumably a constructor or subclass
// populates these two fields (e.g. from DefaultTags / DefaultSelfContainedTags) -- TODO confirm.
protected Dictionary<string, Dictionary<string, bool>> allowedTags = null;
// Tags that ValidateStartTag renders as "<tag />" and whose end tags ValidateEndTag drops.
protected Dictionary<string, bool> selfContainedTags = null;

// Tag tokenizer used by Format(): '<', one or more non-angle characters, then an optional '>'.
private static readonly Regex regex = new Regex("<[^<>]+>?", RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.Multiline);
// Stripped from rendered attributes when cleanJS is set.
private static readonly Regex jsAttributeRegex = new Regex("javascript:", RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.Compiled);
// Hexadecimal character references for CR, LF, tab, space, NEL and LINE SEPARATOR, which can be
// used to disguise attribute content. The "&#x" prefix is required: without it the pattern
// deletes literal "D", "A", "9", "20"... from every attribute. "2028" is listed before "20"
// so the longer reference is removed whole instead of leaving "28;" behind.
private static readonly Regex xmlLineBreak = new Regex("&#x(2028|20|85|0A|0D|[DA9])(;)?", RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.Compiled);
// Matches '=', double quote, single quote, or whitespace. Not referenced elsewhere in the visible file.
private static readonly Regex filterdCharacters = new Regex("\\=|\\\"|\\'|\\s", RegexOptions.Compiled);
// src/href values must start with http:, https:, mailto:, mms:, or '/'.
private static readonly Regex validProtocols = new Regex("^((http(s)?|mailto|mms):|/)", RegexOptions.Compiled | RegexOptions.IgnoreCase);
// Matches any whitespace character. Not referenced elsewhere in the visible file.
private static readonly Regex bannedChars = new Regex("\\s", RegexOptions.Compiled);
// A CSS property name: letters and hyphens only.
private static readonly Regex styleProperty = new Regex(@"^[a-z\-]*$", RegexOptions.IgnoreCase | RegexOptions.Compiled);
// A CSS value: simple tokens, whitespace, or url(...) restricted to the same protocols as validProtocols.
// NOTE(review): the nested quantifiers (group of '*' alternatives under '+') may backtrack heavily
// on long non-matching values -- consider bounding input length or using a match timeout.
private static readonly Regex styleValue = new Regex(@"^(\#?[a-z0-9\.\-\,\']*\%?|\s+|url\(\'?(?:(?:http(s)?|mailto|mms):|/)[^\;\:\)\<\>\n\r\*\""\'\\]*?\'?\))+$", RegexOptions.IgnoreCase | RegexOptions.Compiled);
#endregion
#region Private members
// Raw HTML that Format() tokenizes; null until something assigns it.
private string input;
// Accumulates the scrubbed markup; Clean() returns its contents.
private StringBuilder output = new StringBuilder();
// When true, ValidateAttributes strips "javascript:" from the rendered attributes.
private bool cleanJS;
// Set by Clean() once Format() has run so the pass is not repeated.
private bool isFormatted;
// When true, disallowed tags are HTML-encoded into the output instead of being removed.
private bool encodeExceptions;
#endregion
#region Cleaners
/// <summary>
/// Returns the scrubbed markup, running the formatting pass the first time it is requested.
/// </summary>
/// <returns>The current contents of the output buffer.</returns>
public string Clean() {
    if (isFormatted) {
        return output.ToString();
    }

    // First call: walk the input once, then remember that it has been done.
    Format();
    isFormatted = true;
    return output.ToString();
}
#endregion
#region Format / Walk
/// <summary>
/// HTML-encodes a string, mapping null or empty input to the empty string.
/// </summary>
/// <param name="s">Text to encode; may be null.</param>
/// <returns>The encoded text, or "" when <paramref name="s"/> is null or empty.</returns>
string EnsureHtmlEncode(string s) {
    return String.IsNullOrEmpty(s)
        ? ""
        : System.Web.HttpUtility.HtmlEncode(s);
}
/// <summary>
/// Walks one time through the HTML. Every tag-like token is passed through
/// <c>Validate</c>; the text between tokens is HTML-encoded and appended to
/// the output buffer.
/// </summary>
protected virtual void Format() {
    // Nothing to scrub: leave the buffer untouched rather than letting Regex.Match throw on null.
    if (string.IsNullOrEmpty(input)) {
        return;
    }

    int lastEndIndex = 0;
    for (Match match = regex.Match(input); match.Success; match = match.NextMatch()) {
        if (match.Index > lastEndIndex) {
            // Add whatever text was between the tags.
            output.Append(EnsureHtmlEncode(input.Substring(lastEndIndex, match.Index - lastEndIndex)));
        }
        output.Append(Validate(match.Value));
        lastEndIndex = match.Index + match.Length;
    }

    // Add whatever text exists after the last tag.
    if (lastEndIndex < input.Length) {
        output.Append(EnsureHtmlEncode(input.Substring(lastEndIndex)));
    }
}
#endregion
#region Validators
/// <summary>
/// Main entry point for element validation: dispatches a raw tag token to
/// the end-tag, self-closed-tag, or start-tag validator.
/// </summary>
/// <param name="tag">The raw tag text as matched by the tag regex, e.g. "&lt;b&gt;" or "&lt;/b&gt;".</param>
/// <returns>The validated tag text produced by the selected validator.</returns>
protected string Validate(string tag) {
    // The prefix must be "</": an empty prefix matches every string and would
    // send all tags to ValidateEndTag, leaving the two branches below unreachable.
    if (tag.StartsWith("</", StringComparison.Ordinal)) {
        return ValidateEndTag(tag);
    }
    if (tag.EndsWith("/>", StringComparison.Ordinal)) {
        return ValidateSingleTag(tag);
    }
    return ValidateStartTag(tag);
}
/// <summary>
/// Validates single element tags such as &lt;br /&gt; and &lt;img /&gt;.
/// </summary>
/// <param name="tag">A raw tag that starts with '&lt;' and ends with "/&gt;".</param>
/// <returns>
/// The tag re-rendered as "&lt;name attrs /&gt;" with only validated attributes, or, when the
/// tag name is not in <c>allowedTags</c>, the encoded tag or the empty string depending on
/// <c>encodeExceptions</c>.
/// </returns>
private string ValidateSingleTag(string tag) {
    // Drop the leading '<' and the trailing "/>".
    string strip = tag.Substring(1, tag.Length - 3).Trim();

    // The tag name runs up to the first blank or line break (or the whole string).
    int index = strip.IndexOfAny(new char[] { ' ', '\r', '\n' });
    if (index == -1) {
        index = strip.Length;
    }
    // Invariant casing so whitelist lookups do not depend on the current culture.
    string tagName = strip.Substring(0, index).ToLowerInvariant();

    Dictionary<string, bool> allowedAttributes;
    if (!allowedTags.TryGetValue(tagName, out allowedAttributes)) {
        return encodeExceptions ? EnsureHtmlEncode(tag) : string.Empty;
    }

    string atts = strip.Substring(tagName.Length).Trim();
    return ValidateAttributes(allowedAttributes, atts, tagName, "<{0}{1} />");
}
/// <summary>
/// Validates a start tag.
/// </summary>
/// <param name="tag">A raw start tag beginning with '&lt;'; the closing '&gt;' may be absent.</param>
/// <returns>
/// The tag re-rendered with only validated attributes (self-closed when the name is in
/// <c>selfContainedTags</c>), or, when the tag name is not in <c>allowedTags</c>, the encoded
/// tag or the empty string depending on <c>encodeExceptions</c>.
/// </returns>
protected virtual string ValidateStartTag(string tag) {
    // The tag regex makes the closing '>' optional, so only exclude it when it is really
    // there; otherwise the last character of the name or attributes would be chopped off.
    int contentEnd = tag.EndsWith(">", StringComparison.Ordinal) ? tag.Length - 1 : tag.Length;

    // Check for potential attributes: the name ends at the first blank or line break.
    int nameEnd = tag.IndexOfAny(new char[] { ' ', '\r', '\n' });
    if (nameEnd == -1) {
        nameEnd = contentEnd; // simple tag, no attributes
    }
    if (nameEnd < 1) {
        // No room for a name after the first character; treat as an invalid tag.
        return encodeExceptions ? EnsureHtmlEncode(tag) : string.Empty;
    }

    // Grab the tag name (invariant casing keeps whitelist lookups culture-independent).
    string tagName = tag.Substring(1, nameEnd - 1).ToLowerInvariant();

    Dictionary<string, bool> allowedAttributes;
    if (!allowedTags.TryGetValue(tagName, out allowedAttributes)) {
        // Not whitelisted: remove (or encode) the element and all of its attributes.
        return encodeExceptions ? EnsureHtmlEncode(tag) : string.Empty;
    }

    // Everything after the tag name, up to the closing '>', is attribute text.
    int attributesStart = tagName.Length + 1;
    string attributes = tag.Substring(attributesStart, contentEnd - attributesStart).Trim();

    string tagFormat = selfContainedTags.ContainsKey(tagName) ? "<{0}{1} />" : "<{0}{1}>";
    return ValidateAttributes(allowedAttributes, attributes, tagName, tagFormat);
}
/// <summary>
/// Validates the element's attribute collection and renders the sanitized tag.
/// </summary>
/// <param name="allowedAttributes">Lower-case attribute names permitted on this element.</param>
/// <param name="tagAttributes">Raw attribute text from the tag, without the tag name or angle brackets.</param>
/// <param name="tagName">Lower-case element name.</param>
/// <param name="tagFormat">Composite format string; {0} receives the tag name and {1} the rendered attributes.</param>
/// <returns>The formatted tag containing only the attributes that passed validation.</returns>
protected virtual string ValidateAttributes(Dictionary<string, bool> allowedAttributes, string tagAttributes, string tagName, string tagFormat) {
    StringBuilder atts = new StringBuilder();
    bool hasAlt = false;

    // Do we even have any attributes to validate?
    if (allowedAttributes.Count > 0) {
        tagAttributes = xmlLineBreak.Replace(tagAttributes ?? string.Empty, string.Empty);

        int start = 0;
        while (start < tagAttributes.Length) {
            // The attribute name runs up to the next '='.
            int end = tagAttributes.IndexOf('=', start);
            bool hasValue = end >= 0;
            if (!hasValue) {
                end = tagAttributes.Length;
            }
            string att = tagAttributes.Substring(start, end - start).Trim().ToLowerInvariant();

            // Advance the end index past the attribute value.
            if (hasValue) {
                end = FindAttributeValueEnd(tagAttributes, end + 1);
            }

            // Attributes without '=' are skipped: there is no value to validate or render.
            if (hasValue && allowedAttributes.ContainsKey(att)) {
                string attValue = ExtractAttributeValue(tagAttributes.Substring(start, end - start), att.Length);

                // Special actions on these attributes. IE will render just about anything that
                // looks like the word javascript: (line breaks, character codes, etc.).
                if (att == "src" || att == "href") {
                    attValue = EnsureHtmlEncode(attValue).Replace(" ", "%20");
                    // Only http, https, mailto, mms and '/' (relative) targets are rendered;
                    // anything else drops the attribute entirely.
                    if (validProtocols.IsMatch(attValue)) {
                        AppendAttribute(atts, att, attValue);
                    }
                } else if (att == "style") {
                    string style = SanitizeStyle(attValue);
                    if (style.Length > 0) {
                        AppendAttribute(atts, "style", EnsureHtmlEncode(style));
                    }
                } else {
                    if (att == "alt") {
                        hasAlt = true;
                    }
                    AppendAttribute(atts, att, EnsureHtmlEncode(attValue));
                }
            }

            // 'end' is always greater than 'start' here, so the loop makes progress.
            start = end;
        }

        // Are we filtering for Javascript? Repeat until stable so that nested input such as
        // "javajavascript:script:" cannot reassemble the token after a single removal.
        if (cleanJS) {
            string cleaned = atts.ToString();
            string previous;
            do {
                previous = cleaned;
                cleaned = jsAttributeRegex.Replace(previous, string.Empty);
            } while (cleaned.Length != previous.Length);
            atts = new StringBuilder(cleaned);
        }
    }

    // Images always get an alt attribute.
    if (tagName == "img" && !hasAlt) {
        atts.Append(" alt=\"\"");
    }
    return string.Format(tagFormat, tagName, atts.ToString());
}

// Characters that terminate an unquoted attribute value.
private static readonly char[] attributeValueTerminators = new char[] { ' ', '\r', '\n' };

// Returns the index just past the attribute value that begins at or after 'position'
// (the character following '='), or text.Length when the value runs to the end.
private static int FindAttributeValueEnd(string text, int position) {
    // Skip any blanks after the '='.
    while (position < text.Length && (text[position] == ' ' || text[position] == '\r' || text[position] == '\n')) {
        position++;
    }
    if (position >= text.Length) {
        return text.Length;
    }

    char first = text[position];
    int terminator = (first == '"' || first == '\'')
        ? text.IndexOf(first, position + 1)                      // quoted: find the matching quote
        : text.IndexOfAny(attributeValueTerminators, position);  // otherwise, assume not quoted
    // No terminator: the value runs to the end. Otherwise step past the terminator.
    return terminator < 0 ? text.Length : terminator + 1;
}

// Extracts the value from a "name = value" segment, removing surrounding quotes.
// Returns "" for an empty value; tolerates a missing closing quote.
private static string ExtractAttributeValue(string segment, int nameLength) {
    string value = segment.Trim();
    value = value.Length > nameLength ? value.Substring(nameLength).Trim() : string.Empty;
    if (value.Length == 0 || value[0] != '=') {
        return string.Empty;
    }
    value = value.Substring(1).Trim();

    if (value.Length > 0 && (value[0] == '"' || value[0] == '\'')) {
        bool terminated = value.Length > 1 && value[value.Length - 1] == value[0];
        value = terminated ? value.Substring(1, value.Length - 2) : value.Substring(1);
    }
    return value;
}

// Keeps only the "property:value" declarations that pass the styleProperty/styleValue checks.
private static string SanitizeStyle(string rawStyle) {
    StringBuilder normalized = new StringBuilder(rawStyle);
    // Encoded quotes become single quotes before the declarations are split on ';'.
    normalized.Replace("&quot;", "'");
    normalized.Replace("&#34;", "'");
    normalized.Replace("&#39;", "'");

    StringBuilder kept = new StringBuilder();
    foreach (string declaration in normalized.ToString().Split(';')) {
        string[] nameValue = declaration.Split(':');
        if (nameValue.Length != 2) {
            continue;
        }
        string property = nameValue[0].Trim();
        string value = nameValue[1].Trim();
        if (styleProperty.IsMatch(property) && styleValue.IsMatch(value)) {
            kept.Append(property).Append(":").Append(value).Append(";");
        }
    }
    return kept.ToString();
}

// Appends ' name="value"' to the builder; the value must already be encoded.
private static void AppendAttribute(StringBuilder builder, string name, string encodedValue) {
    builder.Append(" ").Append(name).Append("=\"").Append(encodedValue).Append("\"");
}
/// <summary>
/// Validates an end (closing) tag.
/// </summary>
/// <param name="tag">A raw token starting with "&lt;/"; the closing '&gt;' may be absent.</param>
/// <returns>
/// "&lt;/name&gt;" in lower case when the name is whitelisted and not self-contained; the empty
/// string for self-contained tags; otherwise the encoded tag or the empty string depending on
/// <c>encodeExceptions</c>.
/// </returns>
protected virtual string ValidateEndTag(string tag) {
    // The tag regex also yields fragments such as "</" or "</bx" (no closing '>').
    // Only "</" + name + ">" is parsed; slicing a fragment would throw or mis-read the name.
    bool wellFormed = tag.Length >= 3 && tag[tag.Length - 1] == '>';
    string tagName = wellFormed
        ? tag.Substring(2, tag.Length - 3).Trim().ToLowerInvariant()
        : string.Empty;

    if (tagName.Length == 0 || !allowedTags.ContainsKey(tagName)) {
        return encodeExceptions ? EnsureHtmlEncode(tag) : string.Empty;
    }
    if (selfContainedTags.ContainsKey(tagName)) {
        // Self-contained elements are rendered as "<tag />", so their end tags are dropped.
        return string.Empty;
    }
    return "</" + tagName + ">";
}
#endregion
}
}