001 // Copyright (c) 2011, Mike Samuel
002 // All rights reserved.
003 //
004 // Redistribution and use in source and binary forms, with or without
005 // modification, are permitted provided that the following conditions
006 // are met:
007 //
008 // Redistributions of source code must retain the above copyright
009 // notice, this list of conditions and the following disclaimer.
010 // Redistributions in binary form must reproduce the above copyright
011 // notice, this list of conditions and the following disclaimer in the
012 // documentation and/or other materials provided with the distribution.
013 // Neither the name of the OWASP nor the names of its contributors may
014 // be used to endorse or promote products derived from this software
015 // without specific prior written permission.
016 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
017 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
018 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
019 // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
020 // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
021 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
022 // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
023 // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
024 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
025 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
026 // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
027 // POSSIBILITY OF SUCH DAMAGE.
028
029 package org.owasp.html.examples;
030
031 import java.io.IOException;
032 import java.io.InputStreamReader;
033 import java.util.regex.Pattern;
034
035 import org.owasp.html.Handler;
036 import org.owasp.html.HtmlPolicyBuilder;
037 import org.owasp.html.HtmlSanitizer;
038 import org.owasp.html.HtmlStreamRenderer;
039 import org.owasp.html.PolicyFactory;
040
041 import com.google.common.base.Charsets;
042 import com.google.common.base.Predicate;
043 import com.google.common.base.Throwables;
044 import com.google.common.io.CharStreams;
045
046 /**
047 * Based on the
048 * <a href="http://www.owasp.org/index.php/Category:OWASP_AntiSamy_Project#Stage_2_-_Choosing_a_base_policy_file">AntiSamy EBay example</a>.
049 * <blockquote>
050 * eBay (http://www.ebay.com/) is the most popular online auction site in the
051 * universe, as far as I can tell. It is a public site so anyone is allowed to
052 * post listings with rich HTML content. It's not surprising that given the
053 * attractiveness of eBay as a target that it has been subject to a few complex
054 * XSS attacks. Listings are allowed to contain much more rich content than,
055 * say, Slashdot- so it's attack surface is considerably larger. The following
056 * tags appear to be accepted by eBay (they don't publish rules):
057 * {@code <a>},...
058 * </blockquote>
059 */
060 public class EbayPolicyExample {
061
062 // Some common regular expression definitions.
063
064 // The 16 colors defined by the HTML Spec (also used by the CSS Spec)
065 private static final Pattern COLOR_NAME = Pattern.compile(
066 "(?:aqua|black|blue|fuchsia|gray|grey|green|lime|maroon|navy|olive|purple"
067 + "|red|silver|teal|white|yellow)");
068
069 // HTML/CSS Spec allows 3 or 6 digit hex to specify color
070 private static final Pattern COLOR_CODE = Pattern.compile(
071 "(?:#(?:[0-9a-fA-F]{3}(?:[0-9a-fA-F]{3})?))");
072
073 private static final Pattern NUMBER_OR_PERCENT = Pattern.compile(
074 "[0-9]+%?");
075 private static final Pattern PARAGRAPH = Pattern.compile(
076 "(?:[\\p{L}\\p{N},'\\.\\s\\-_\\(\\)]|&[0-9]{2};)*");
077 private static final Pattern HTML_ID = Pattern.compile(
078 "[a-zA-Z0-9\\:\\-_\\.]+");
079 // force non-empty with a '+' at the end instead of '*'
080 private static final Pattern HTML_TITLE = Pattern.compile(
081 "[\\p{L}\\p{N}\\s\\-_',:\\[\\]!\\./\\\\\\(\\)&]*");
082 private static final Pattern HTML_CLASS = Pattern.compile(
083 "[a-zA-Z0-9\\s,\\-_]+");
084
085 private static final Pattern ONSITE_URL = Pattern.compile(
086 "(?:[\\p{L}\\p{N}\\\\\\.\\#@\\$%\\+&;\\-_~,\\?=/!]+|\\#(\\w)+)");
087 private static final Pattern OFFSITE_URL = Pattern.compile(
088 "\\s*(?:(?:ht|f)tps?://|mailto:)[\\p{L}\\p{N}]"
089 + "[\\p{L}\\p{N}\\p{Zs}\\.\\#@\\$%\\+&;:\\-_~,\\?=/!\\(\\)]*+\\s*");
090
091 private static final Pattern NUMBER = Pattern.compile(
092 "[+-]?(?:(?:[0-9]+(?:\\.[0-9]*)?)|\\.[0-9]+)");
093
094 private static final Pattern NAME = Pattern.compile("[a-zA-Z0-9\\-_\\$]+");
095
096 private static final Pattern ALIGN = Pattern.compile(
097 "(?i)center|left|right|justify|char");
098
099 private static final Pattern VALIGN = Pattern.compile(
100 "(?i)baseline|bottom|middle|top");
101
102 private static final Predicate<String> COLOR_NAME_OR_COLOR_CODE
103 = new Predicate<String>() {
104 public boolean apply(String s) {
105 return COLOR_NAME.matcher(s).matches()
106 || COLOR_CODE.matcher(s).matches();
107 }
108 };
109
110 private static final Predicate<String> ONSITE_OR_OFFSITE_URL
111 = new Predicate<String>() {
112 public boolean apply(String s) {
113 return ONSITE_URL.matcher(s).matches()
114 || OFFSITE_URL.matcher(s).matches();
115 }
116 };
117
118 private static final Pattern HISTORY_BACK = Pattern.compile(
119 "(?:javascript:)?\\Qhistory.go(-1)\\E");
120
121 private static final Pattern ONE_CHAR = Pattern.compile(
122 ".?", Pattern.DOTALL);
123
124
125
126 public static final PolicyFactory POLICY_DEFINITION = new HtmlPolicyBuilder()
127 .allowAttributes("id").matching(HTML_ID).globally()
128 .allowAttributes("class").matching(HTML_CLASS).globally()
129 .allowAttributes("lang").matching(Pattern.compile("[a-zA-Z]{2,20}"))
130 .globally()
131 .allowAttributes("title").matching(HTML_TITLE).globally()
132 .allowStyling()
133 .allowAttributes("align").matching(ALIGN).onElements("p")
134 .allowAttributes("for").matching(HTML_ID).onElements("label")
135 .allowAttributes("color").matching(COLOR_NAME_OR_COLOR_CODE)
136 .onElements("font")
137 .allowAttributes("face")
138 .matching(Pattern.compile("[\\w;, \\-]+"))
139 .onElements("font")
140 .allowAttributes("size").matching(NUMBER).onElements("font")
141 .allowAttributes("href").matching(ONSITE_OR_OFFSITE_URL)
142 .onElements("a")
143 .allowStandardUrlProtocols()
144 .allowAttributes("nohref").onElements("a")
145 .allowAttributes("name").matching(NAME).onElements("a")
146 .allowAttributes(
147 "onfocus", "onblur", "onclick", "onmousedown", "onmouseup")
148 .matching(HISTORY_BACK).onElements("a")
149 .requireRelNofollowOnLinks()
150 .allowAttributes("src").matching(ONSITE_OR_OFFSITE_URL)
151 .onElements("img")
152 .allowAttributes("name").matching(NAME)
153 .onElements("img")
154 .allowAttributes("alt").matching(PARAGRAPH)
155 .onElements("img")
156 .allowAttributes("border", "hspace", "vspace").matching(NUMBER)
157 .onElements("img")
158 .allowAttributes("border", "cellpadding", "cellspacing")
159 .matching(NUMBER).onElements("table")
160 .allowAttributes("bgcolor").matching(COLOR_NAME_OR_COLOR_CODE)
161 .onElements("table")
162 .allowAttributes("background").matching(ONSITE_URL)
163 .onElements("table")
164 .allowAttributes("align").matching(ALIGN)
165 .onElements("table")
166 .allowAttributes("noresize").matching(Pattern.compile("(?i)noresize"))
167 .onElements("table")
168 .allowAttributes("background").matching(ONSITE_URL)
169 .onElements("td", "th", "tr")
170 .allowAttributes("bgcolor").matching(COLOR_NAME_OR_COLOR_CODE)
171 .onElements("td", "th")
172 .allowAttributes("abbr").matching(PARAGRAPH)
173 .onElements("td", "th")
174 .allowAttributes("axis", "headers").matching(NAME)
175 .onElements("td", "th")
176 .allowAttributes("scope")
177 .matching(Pattern.compile("(?i)(?:row|col)(?:group)?"))
178 .onElements("td", "th")
179 .allowAttributes("nowrap")
180 .onElements("td", "th")
181 .allowAttributes("height", "width").matching(NUMBER_OR_PERCENT)
182 .onElements("table", "td", "th", "tr", "img")
183 .allowAttributes("align").matching(ALIGN)
184 .onElements("thead", "tbody", "tfoot", "img",
185 "td", "th", "tr", "colgroup", "col")
186 .allowAttributes("valign").matching(VALIGN)
187 .onElements("thead", "tbody", "tfoot",
188 "td", "th", "tr", "colgroup", "col")
189 .allowAttributes("charoff").matching(NUMBER_OR_PERCENT)
190 .onElements("td", "th", "tr", "colgroup", "col",
191 "thead", "tbody", "tfoot")
192 .allowAttributes("char").matching(ONE_CHAR)
193 .onElements("td", "th", "tr", "colgroup", "col",
194 "thead", "tbody", "tfoot")
195 .allowAttributes("colspan", "rowspan").matching(NUMBER)
196 .onElements("td", "th")
197 .allowAttributes("span", "width").matching(NUMBER_OR_PERCENT)
198 .onElements("colgroup", "col")
199 .allowElements(
200 "a", "label", "noscript", "h1", "h2", "h3", "h4", "h5", "h6",
201 "p", "i", "b", "u", "strong", "em", "small", "big", "pre", "code",
202 "cite", "samp", "sub", "sup", "strike", "center", "blockquote",
203 "hr", "br", "col", "font", "map", "span", "div", "img",
204 "ul", "ol", "li", "dd", "dt", "dl", "tbody", "thead", "tfoot",
205 "table", "td", "th", "tr", "colgroup", "fieldset", "legend")
206 .toFactory();
207
208 public static void main(String[] args) throws IOException {
209 if (args.length != 0) {
210 System.err.println("Reads from STDIN and writes to STDOUT");
211 System.exit(-1);
212 }
213 System.err.println("[Reading from STDIN]");
214 // Fetch the HTML to sanitize.
215 String html = CharStreams.toString(
216 new InputStreamReader(System.in, Charsets.UTF_8));
217 // Set up an output channel to receive the sanitized HTML.
218 HtmlStreamRenderer renderer = HtmlStreamRenderer.create(
219 System.out,
220 // Receives notifications on a failure to write to the output.
221 new Handler<IOException>() {
222 public void handle(IOException ex) {
223 Throwables.propagate(ex); // System.out suppresses IOExceptions
224 }
225 },
226 // Our HTML parser is very lenient, but this receives notifications on
227 // truly bizarre inputs.
228 new Handler<String>() {
229 public void handle(String x) {
230 throw new AssertionError(x);
231 }
232 });
233 // Use the policy defined above to sanitize the HTML.
234 HtmlSanitizer.sanitize(html, POLICY_DEFINITION.apply(renderer));
235 }
236 }